diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index dd4c3bcaba..b1cbf33c41 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -121,10 +121,10 @@ set(STANDARD_PACKAGES ASPHERE BODY CLASS2 COLLOID COMPRESS DIPOLE PLUGIN QEQ REPLICA RIGID SHOCK SPIN SNAP SRD KIM PYTHON MSCG MPIIO VORONOI USER-ADIOS USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-MESODPD USER-CGSDK USER-COLVARS USER-DIFFRACTION USER-DPD USER-DRUDE USER-EFF USER-FEP USER-H5MD - USER-LB USER-MANIFOLD USER-MEAMC USER-MESONT USER-MGPT USER-MISC USER-MOFFF + USER-LB USER-MANIFOLD USER-MDI USER-MEAMC USER-MESONT USER-MGPT USER-MISC USER-MOFFF USER-MOLFILE USER-NETCDF USER-PHONON USER-PLUMED USER-PTM USER-QTB USER-REACTION USER-REAXC USER-SCAFACOS USER-SDPD USER-SMD USER-SMTBQ USER-SPH - USER-TALLY USER-UEF USER-VTK USER-QUIP USER-QMMM USER-YAFF USER-PACE) + USER-TALLY USER-UEF USER-VTK USER-QUIP USER-QMMM USER-YAFF USER-PACE USER-BROWNIAN) set(SUFFIX_PACKAGES CORESHELL GPU KOKKOS OPT USER-INTEL USER-OMP) @@ -324,8 +324,8 @@ else() set(CUDA_REQUEST_PIC) endif() -foreach(PKG_WITH_INCL KSPACE PYTHON MLIAP VORONOI USER-COLVARS USER-MOLFILE USER-NETCDF USER-PLUMED USER-QMMM - USER-QUIP USER-SCAFACOS USER-SMD USER-VTK KIM LATTE MESSAGE MSCG COMPRESS USER-PACE) +foreach(PKG_WITH_INCL KSPACE PYTHON MLIAP VORONOI USER-COLVARS USER-MDI USER-MOLFILE USER-NETCDF USER-PLUMED + USER-QMMM USER-QUIP USER-SCAFACOS USER-SMD USER-VTK KIM LATTE MESSAGE MSCG COMPRESS USER-PACE) if(PKG_${PKG_WITH_INCL}) include(Packages/${PKG_WITH_INCL}) endif() diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index 1f00516e08..e0e32730ec 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -37,8 +37,8 @@ if(DOWNLOAD_KOKKOS) list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") include(ExternalProject) - set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.3.01.tar.gz" CACHE STRING "URL for KOKKOS tarball") - set(KOKKOS_MD5 "08201d1c7cf5bc458ce0f5b44a629d5a" CACHE STRING "MD5 checksum of KOKKOS tarball") + set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/3.4.00.tar.gz" CACHE STRING "URL for KOKKOS tarball") + set(KOKKOS_MD5 "c2fdcedb6953e6160c765366f6045abb" CACHE STRING "MD5 checksum of KOKKOS tarball") mark_as_advanced(KOKKOS_URL) mark_as_advanced(KOKKOS_MD5) ExternalProject_Add(kokkos_build @@ -58,7 +58,7 @@ if(DOWNLOAD_KOKKOS) target_link_libraries(lmp PRIVATE LAMMPS::KOKKOS) add_dependencies(LAMMPS::KOKKOS kokkos_build) elseif(EXTERNAL_KOKKOS) - find_package(Kokkos 3.3.01 REQUIRED CONFIG) + find_package(Kokkos 3.4.00 REQUIRED CONFIG) target_link_libraries(lammps PRIVATE Kokkos::kokkos) target_link_libraries(lmp PRIVATE Kokkos::kokkos) else() diff --git a/cmake/Modules/Packages/USER-MDI.cmake b/cmake/Modules/Packages/USER-MDI.cmake new file mode 100644 index 0000000000..75fe0d2bca --- /dev/null +++ b/cmake/Modules/Packages/USER-MDI.cmake @@ -0,0 +1,66 @@ +find_package(mdi QUIET) +if(${mdi_FOUND}) + set(DOWNLOAD_MDI_DEFAULT OFF) +else() + set(DOWNLOAD_MDI_DEFAULT ON) +endif() +option(DOWNLOAD_MDI "Download and compile the MDI library instead of using an already installed one" ${DOWNLOAD_MDI_DEFAULT}) + +if(DOWNLOAD_MDI) + message(STATUS "MDI download requested - we will build our own") + set(MDI_URL "https://github.com/MolSSI-MDI/MDI_Library/archive/v1.2.9.tar.gz" CACHE STRING "URL for MDI tarball") + set(MDI_MD5 "ddfa46d6ee15b4e59cfd527ec7212184" CACHE STRING "MD5 checksum for MDI tarball") + mark_as_advanced(MDI_URL) + mark_as_advanced(MDI_MD5) + + set(LAMMPS_LIB_MDI_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/mdi) + + include(ExternalProject) + message(STATUS "Building mdi.") + ExternalProject_Add(mdi_external + URL ${MDI_URL} + URL_MD5 ${MDI_MD5} + UPDATE_COMMAND "" + CMAKE_ARGS ${CMAKE_REQUEST_PIC} + -DCMAKE_INSTALL_PREFIX=${LAMMPS_LIB_MDI_BIN_DIR} + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR} + -DCMAKE_INSTALL_INCLUDEDIR=${CMAKE_INSTALL_INCLUDEDIR} + -DBUILD_SHARED_LIBS=${BUILD_SHARED_LIBS} + -Dlanguage=C + CMAKE_CACHE_ARGS -DCMAKE_C_FLAGS:STRING=${CMAKE_C_FLAGS} + -DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} + -DTargetOpenMP_FIND_COMPONENTS:STRING=C;CXX) + + # Link the lammps library against MDI + target_include_directories(lammps PRIVATE ${LAMMPS_LIB_MDI_BIN_DIR}/${CMAKE_INSTALL_INCLUDEDIR}/mdi) + target_link_directories(lammps PRIVATE ${LAMMPS_LIB_MDI_BIN_DIR}/${CMAKE_INSTALL_LIBDIR}/mdi) + target_link_libraries(lammps PRIVATE mdi) + add_dependencies(lammps mdi_external) + + # Link the lammps executable against MDI + target_include_directories(lmp PRIVATE ${LAMMPS_LIB_MDI_BIN_DIR}/${CMAKE_INSTALL_INCLUDEDIR}/mdi) + target_link_directories(lmp PRIVATE ${LAMMPS_LIB_MDI_BIN_DIR}/${CMAKE_INSTALL_LIBDIR}/mdi) + target_link_libraries(lmp PRIVATE mdi) + add_dependencies(lmp mdi_external) + +else() + + find_package(mdi) + if(NOT mdi_FOUND) + message(FATAL_ERROR "MDI library not found. Help CMake to find it " + "by setting mdi_LIBRARY and mdi_INCLUDE_DIR, or set DOWNLOAD_MDI=ON " + "to download and compile it") + endif() + + # Link the lammps library against MDI + target_include_directories(lammps PRIVATE ${mdi_INCLUDE_DIR}) + target_link_libraries(lammps PRIVATE ${mdi_LIBRARY}) + + # Link the lammps executable against MDI + target_include_directories(lmp PRIVATE ${mdi_INCLUDE_DIR}) + target_link_libraries(lmp PRIVATE ${mdi_LIBRARY}) +endif() + +target_compile_definitions(lammps PRIVATE -DLMP_USER_MDI) +target_compile_definitions(lmp PRIVATE -DLMP_USER_MDI) diff --git a/cmake/presets/most.cmake b/cmake/presets/most.cmake index 5dc58b735b..064b22595f 100644 --- a/cmake/presets/most.cmake +++ b/cmake/presets/most.cmake @@ -5,9 +5,9 @@ set(ALL_PACKAGES ASPHERE BODY CLASS2 COLLOID COMPRESS CORESHELL DIPOLE GRANULAR KSPACE MANYBODY MC MISC MLIAP MOLECULE OPT PERI PLUGIN POEMS PYTHON QEQ REPLICA RIGID SHOCK SNAP SPIN SRD VORONOI - USER-BOCS USER-CGDNA USER-CGSDK USER-COLVARS USER-DIFFRACTION - USER-DPD USER-DRUDE USER-EFF USER-FEP USER-MEAMC USER-MESODPD - USER-MISC USER-MOFFF USER-OMP USER-PHONON USER-REACTION + USER-BROWNIAN USER-BOCS USER-CGDNA USER-CGSDK USER-COLVARS + USER-DIFFRACTION USER-DPD USER-DRUDE USER-EFF USER-FEP USER-MEAMC + USER-MESODPD USER-MISC USER-MOFFF USER-OMP USER-PHONON USER-REACTION USER-REAXC USER-SDPD USER-SPH USER-SMD USER-UEF USER-YAFF) foreach(PKG ${ALL_PACKAGES}) diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index d375d33e56..25ef36b430 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -49,6 +49,7 @@ This is the list of packages that may require additional steps. * :ref:`USER-COLVARS ` * :ref:`USER-H5MD ` * :ref:`USER-INTEL ` + * :ref:`USER-MDI ` * :ref:`USER-MESONT ` * :ref:`USER-MOLFILE ` * :ref:`USER-NETCDF ` @@ -467,6 +468,9 @@ They must be specified in uppercase. * - ARMV8_THUNDERX2 - HOST - ARMv8 Cavium ThunderX2 CPU + * - A64FX + - HOST + - ARMv8.2 with SVE Support * - WSM - HOST - Intel Westmere CPU (SSE 4.2) @@ -539,6 +543,9 @@ They must be specified in uppercase. * - AMPERE80 - GPU - NVIDIA Ampere generation CC 8.0 GPU + * - AMPERE86 + - GPU + - NVIDIA Ampere generation CC 8.6 GPU * - VEGA900 - GPU - AMD GPU MI25 GFX900 @@ -547,12 +554,12 @@ They must be specified in uppercase. - AMD GPU MI50/MI60 GFX906 * - VEGA908 - GPU - - AMD GPU GFX908 + - AMD GPU MI100 GFX908 * - INTEL_GEN - GPU - Intel GPUs Gen9+ -This list was last updated for version 3.3 of the Kokkos library. +This list was last updated for version 3.4 of the Kokkos library. .. tabs:: @@ -1533,6 +1540,35 @@ TBB and MKL. ---------- +.. _user-mdi: + +USER-MDI package +----------------------------- + +.. tabs:: + + .. tab:: CMake build + + .. code-block:: bash + + -D DOWNLOAD_MDI=value # download MDI Library for build, value = no (default) or yes + + .. tab:: Traditional make + + Before building LAMMPS, you must build the MDI Library in + ``lib/mdi``\ . You can do this by executing a command like one + of the following from the ``lib/mdi`` directory: + + .. code-block:: bash + + $ python Install.py -m gcc # build using gcc compiler + $ python Install.py -m icc # build using icc compiler + + The build should produce two files: ``lib/mdi/includelink/mdi.h`` + and ``lib/mdi/liblink/libmdi.so``\ . + +---------- + .. _user-mesont: USER-MESONT package diff --git a/doc/src/Commands_all.rst b/doc/src/Commands_all.rst index b43fd0ed56..c708228be7 100644 --- a/doc/src/Commands_all.rst +++ b/doc/src/Commands_all.rst @@ -67,6 +67,7 @@ An alphabetic list of all general LAMMPS commands. * :doc:`lattice ` * :doc:`log ` * :doc:`mass ` + * :doc:`mdi/engine ` * :doc:`message ` * :doc:`minimize ` * :doc:`min_modify ` diff --git a/doc/src/Commands_fix.rst b/doc/src/Commands_fix.rst index 671716e89d..0dbe8f5bec 100644 --- a/doc/src/Commands_fix.rst +++ b/doc/src/Commands_fix.rst @@ -39,6 +39,9 @@ OPT. * :doc:`ave/time ` * :doc:`aveforce ` * :doc:`balance ` + * :doc:`brownian ` + * :doc:`brownian/asphere ` + * :doc:`brownian/sphere ` * :doc:`bocs ` * :doc:`bond/break ` * :doc:`bond/create ` @@ -98,6 +101,7 @@ OPT. * :doc:`lb/viscous ` * :doc:`lineforce ` * :doc:`manifoldforce ` + * :doc:`mdi/engine ` * :doc:`meso/move ` * :doc:`momentum (k) ` * :doc:`momentum/chunk ` diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index 40b81a2fd1..b7baaa8581 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -171,7 +171,7 @@ OPT. * :doc:`lj/sdk/coul/long (go) ` * :doc:`lj/sdk/coul/msm (o) ` * :doc:`lj/sf/dipole/sf (go) ` - * :doc:`lj/smooth (o) ` + * :doc:`lj/smooth (go) ` * :doc:`lj/smooth/linear (o) ` * :doc:`lj/switch3/coulgauss/long ` * :doc:`lj96/cut (go) ` diff --git a/doc/src/Examples.rst b/doc/src/Examples.rst index 8a76dca66e..8bfc4ad573 100644 --- a/doc/src/Examples.rst +++ b/doc/src/Examples.rst @@ -108,6 +108,8 @@ Lowercase directories +-------------+------------------------------------------------------------------+ | msst | MSST shock dynamics | +-------------+------------------------------------------------------------------+ +| multi | multi neighboring for systems with large interaction disparities | ++-------------+------------------------------------------------------------------+ | nb3b | use of non-bonded 3-body harmonic pair style | +-------------+------------------------------------------------------------------+ | neb | nudged elastic band (NEB) calculation for barrier finding | diff --git a/doc/src/Howto.rst b/doc/src/Howto.rst index ff75b751b1..99a9fae4da 100644 --- a/doc/src/Howto.rst +++ b/doc/src/Howto.rst @@ -23,6 +23,7 @@ General howto Howto_library Howto_couple Howto_client_server + Howto_mdi Settings howto ============== diff --git a/doc/src/Howto_mdi.rst b/doc/src/Howto_mdi.rst new file mode 100644 index 0000000000..a0a1cd0286 --- /dev/null +++ b/doc/src/Howto_mdi.rst @@ -0,0 +1,132 @@ +Using LAMMPS with the MDI library for code coupling +=================================================== + +..note:: + + This Howto doc page will eventually replace the + :doc:`Howto client/server ` doc page. + +Client/server coupling of two codes is where one code is the "client" +and sends request messages (data) to a "server" code. The server +responds to each request with a reply message. This enables the two +codes to work in tandem to perform a simulation. LAMMPS can act as +either a client or server code; it does this by using the `MolSSI +Driver Interface (MDI) library +`_, +developed by the `Molecular Sciences Software Institute (MolSSI) +`_. + +Alternate methods for code coupling with LAMMPS are described on the +:doc:`Howto couple ` doc page. + +Some advantages of client/server coupling are that the two codes can run +as stand-alone executables; they need not be linked together. Thus +neither code needs to have a library interface. This also makes it easy +to run the two codes on different numbers of processors. If a message +protocol (format and content) is defined for a particular kind of +simulation, then in principle any code which implements the client-side +protocol can be used in tandem with any code which implements the +server-side protocol. Neither code needs to know what specific other +code it is working with. + +In MDI nomenclature, a client code is the "driver", and a server code is +an "engine". One driver code can communicate with one or more instances +of one or more engine codes. Driver and engine codes can be written in +any language: C, C++, Fortran, Python, etc. + +In addition to allowing driver and engine(s) running to run as +stand-alone executables, MDI also enables a server code to be a +"plugin" to the client code. In this scenario, server code(s) are +compiled as shared libraries, and one (or more) instances of the +server are instantiated by the driver code. If the driver code runs +in parallel, it can split its MPI communicator into multiple +sub-communicators, and launch each plugin engine instance on a +sub-communicator. Driver processors in that sub-communicator exchange +messages with that engine instance, and can also send MPI messages to +other processors in the driver. The driver code can also destroy +engine instances and re-instantiate them. + +The way that a driver communicates with an engine is by making +MDI_Send() and MDI_Recv() calls, which are conceptually similar to +MPI_Send() and MPI_Recv() calls. Each send or receive has a string +which identifies the command name, and optionally some data, which can +be a single value or vector of values of any data type. Inside the +MDI library, data is exchanged between the driver and engine via MPI +calls or sockets. This a run-time choice by the user. + +------------- + +As an example, LAMMPS and the ``pw.x`` command from Quantum Espresso (a +suite of quantum DFT codes), can work together via the MDI library to +perform an ab initio MD (AIMD) simulation, where LAMMPS runs an MD +simulation and sends a message each timestep to ``pw.x`` asking it to +compute quantum forces on the current configuration of atoms. Here is +how the 2 codes are launched to communicate by MPI: + +.. code-block:: bash + +% mpirun -np 2 lmp_mpi -mdi "-role DRIVER -name d -method MPI" \ + -in in.aimd : -np 16 pw.x -in qe.in -mdi "-role ENGINE -name e -method MPI" + +In this case LAMMPS runs on 2 processors (MPI tasks), ``pw.x`` runs on 16 +processors. + +Here is how the 2 codes are launched to communicate by sockets: + +.. code-block:: bash + +% mpirun -np 2 lmp_mpi -mdi "-role DRIVER -name d -method TCP -port 8021" -in in.aimd +% mpirun -np 16 pw.x -in qe.in -mdi "-role ENGINE -name e -method TCP -port 8021 -hostname localhost" + +These commands could be issued in different windows on a desktop +machine. Or in the same window, if the first command is ended with +"&" so as to run in the background. If "localhost" is replaced by an +IP address, ``pw.x`` could be run on another machine on the same network, or +even on another machine across the country. + +After both codes initialize themselves to model the same system, this is +what occurs each timestep: + +* LAMMPS send a ">COORDS" message to ``pw.x`` with a 3*N vector of current atom coords +* ``pw.x`` receives the message/coords and computes quantum forces on all the atoms +* LAMMPS send a "` command. This will put LAMMPS into +engine mode where it waits for messages and data from the driver. +When the driver sends an "EXIT" command, LAMMPS will exit engine mode +and the input script will continue. + +If LAMMPS is used as a plugin engine it operates the same way, except +that the driver will pass LAMMPS an input script to initialize itself. +Upon receiving the "EXIT" command, LAMMPS will exit engine mode and the +input script will continue. After finishing execution of the input +script, the instance of LAMMPS will be destroyed. + +LAMMPS supports the full set of MD-appropriate engine commands defined +by the MDI library. See the :doc:`mdi/engine ` doc page for +a list of these. + +If those commands are not sufficient for a user-developed driver to use +LAMMPS as an engine, then new commands can be easily added. See these +two files which implement the definition of MDI commands and the logic +for responding to them: + +* src/MDI/mdi_engine.cpp +* src/MDI/fix_mdi_engine.cpp diff --git a/doc/src/Packages_details.rst b/doc/src/Packages_details.rst index f1c590d850..e70c20a60d 100644 --- a/doc/src/Packages_details.rst +++ b/doc/src/Packages_details.rst @@ -69,6 +69,7 @@ page gives those details. * :ref:`USER-ATC ` * :ref:`USER-AWPMD ` * :ref:`USER-BOCS ` + * :ref:`USER-BROWNIAN ` * :ref:`USER-CGDNA ` * :ref:`USER-CGSDK ` * :ref:`USER-COLVARS ` @@ -81,6 +82,7 @@ page gives those details. * :ref:`USER-INTEL ` * :ref:`USER-LB ` * :ref:`USER-MANIFOLD ` + * :ref:`USER-MDI ` * :ref:`USER-MEAMC ` * :ref:`USER-MESODPD ` * :ref:`USER-MESONT ` @@ -1266,6 +1268,26 @@ Example inputs are in the examples/USER/bocs folder. ---------- +.. _PKG-USER-BROWNIAN: + +USER-BROWNIAN package +--------------------- + +**Contents:** + +This package provides :doc:`fix brownian, fix brownian/sphere, and +fix brownian/asphere ` as well as +:doc:`fix propel/self ` which allow to do Brownian +Dynamics time integration of point, spherical and aspherical particles +and also support self-propelled particles. + +**Authors:** Sam Cameron (University of Bristol), +Stefan Paquay (while at Brandeis University) (initial version of fix propel/self) + +Example inputs are in the examples/USER/brownian folder. + +---------- + .. _PKG-USER-CGDNA: USER-CGDNA package @@ -1770,6 +1792,28 @@ Waltham, MA, USA) ---------- +.. _PKG-USER-MDI: + +USER-MDI package +---------------- + +**Contents:** + +A LAMMPS command and fix to allow client-server coupling of LAMMPS to +other atomic or molecular simulation codes via the `MolSSI Driver Interface +(MDI) library `_. + +**Author:** Taylor Barnes - MolSSI, taylor.a.barnes at gmail.com + +**Supporting info:** + +* src/USER-MDI/README +* :doc:`mdi/engine ` +* :doc:`fix mdi/engine ` +* examples/USER/mdi + +---------- + .. _PKG-USER-MEAMC: USER-MEAMC package diff --git a/doc/src/Packages_user.rst b/doc/src/Packages_user.rst index 00d1dfb67b..f14d449dfd 100644 --- a/doc/src/Packages_user.rst +++ b/doc/src/Packages_user.rst @@ -39,6 +39,8 @@ package: +------------------------------------------------+-----------------------------------------------------------------+-------------------------------------------------------------------------------+------------------------------------------------------+---------+ | :ref:`USER-BOCS ` | BOCS bottom up coarse graining | :doc:`fix bocs ` | USER/bocs | no | +------------------------------------------------+-----------------------------------------------------------------+-------------------------------------------------------------------------------+------------------------------------------------------+---------+ +| :ref:`USER-BROWNIAN ` | Brownian dynamics and self-propelled particles | :doc:`fix brownian `, :doc:`fix propel/self ` | USER/brownian | no | ++------------------------------------------------+-----------------------------------------------------------------+-------------------------------------------------------------------------------+------------------------------------------------------+---------+ | :ref:`USER-CGDNA ` | coarse-grained DNA force fields | src/USER-CGDNA/README | USER/cgdna | no | +------------------------------------------------+-----------------------------------------------------------------+-------------------------------------------------------------------------------+------------------------------------------------------+---------+ | :ref:`USER-CGSDK ` | SDK coarse-graining model | :doc:`pair_style lj/sdk ` | USER/cgsdk | no | @@ -63,6 +65,8 @@ package: +------------------------------------------------+-----------------------------------------------------------------+-------------------------------------------------------------------------------+------------------------------------------------------+---------+ | :ref:`USER-MANIFOLD ` | motion on 2d surfaces | :doc:`fix manifoldforce ` | USER/manifold | no | +------------------------------------------------+-----------------------------------------------------------------+-------------------------------------------------------------------------------+------------------------------------------------------+---------+ +| :ref:`USER-MDI ` | client-server coupling | :doc:`MDI Howto ` | USER/mdi | ext | ++------------------------------------------------+-----------------------------------------------------------------+-------------------------------------------------------------------------------+------------------------------------------------------+---------+ | :ref:`USER-MEAMC ` | modified EAM potential (C++) | :doc:`pair_style meam/c ` | meamc | no | +------------------------------------------------+-----------------------------------------------------------------+-------------------------------------------------------------------------------+------------------------------------------------------+---------+ | :ref:`USER-MESODPD ` | mesoscale DPD models | :doc:`pair_style edpd ` | USER/mesodpd | no | diff --git a/doc/src/comm_modify.rst b/doc/src/comm_modify.rst index 864c544fed..27b370c941 100644 --- a/doc/src/comm_modify.rst +++ b/doc/src/comm_modify.rst @@ -11,13 +11,17 @@ Syntax comm_modify keyword value ... * zero or more keyword/value pairs may be appended -* keyword = *mode* or *cutoff* or *cutoff/multi* or *group* or *vel* +* keyword = *mode* or *cutoff* or *cutoff/multi* or *multi/reduce* or *group* or *vel* .. parsed-literal:: - *mode* value = *single* or *multi* = communicate atoms within a single or multiple distances + *mode* value = *single*, *multi*, or *multi/old* = communicate atoms within a single or multiple distances *cutoff* value = Rcut (distance units) = communicate atoms from this far away - *cutoff/multi* type value + *cutoff/multi* collection value + collection = atom collection or collection range (supports asterisk notation) + value = Rcut (distance units) = communicate atoms for selected types from this far away + *reduce/multi* arg = none = reduce number of communicated ghost atoms for multi style + *cutoff/multi/old* type value type = atom type or type range (supports asterisk notation) value = Rcut (distance units) = communicate atoms for selected types from this far away *group* value = group-ID = only communicate atoms in the group @@ -28,9 +32,9 @@ Examples .. code-block:: LAMMPS - comm_modify mode multi + comm_modify mode multi reduce/multi comm_modify mode multi group solvent - comm_modift mode multi cutoff/multi 1 10.0 cutoff/multi 2*4 15.0 + comm_modify mode multi cutoff/multi 1 10.0 cutoff/multi 2*4 15.0 comm_modify vel yes comm_modify mode single cutoff 5.0 vel yes comm_modify cutoff/multi * 0.0 @@ -62,12 +66,18 @@ sub-domain. The distance is by default the maximum of the neighbor cutoff across all atom type pairs. For many systems this is an efficient algorithm, but for systems with -widely varying cutoffs for different type pairs, the *multi* mode can -be faster. In this case, each atom type is assigned its own distance +widely varying cutoffs for different type pairs, the *multi* or *multi/old* mode can +be faster. In *multi*, each atom is assigned to a collection which should +correspond to a set of atoms with similar interaction cutoffs. +In this case, each atom collection is assigned its own distance cutoff for communication purposes, and fewer atoms will be -communicated. See the :doc:`neighbor multi ` command for a -neighbor list construction option that may also be beneficial for -simulations of this kind. +communicated. in *multi/old*, a similar technique is used but atoms +are grouped by atom type. See the :doc:`neighbor multi ` and +:doc:`neighbor multi/old ` commands for +neighbor list construction options that may also be beneficial for +simulations of this kind. The *multi* communication mode is only compatible +with the *multi* neighbor style. The *multi/old* communication mode is comparable +with both the *multi* and *multi/old* neighbor styles. The *cutoff* keyword allows you to extend the ghost cutoff distance for communication mode *single*\ , which is the distance from the borders @@ -87,12 +97,26 @@ warning is printed, if this bond based estimate is larger than the communication cutoff used. The *cutoff/multi* option is equivalent to *cutoff*\ , but applies to -communication mode *multi* instead. Since in this case the communication -cutoffs are determined per atom type, a type specifier is needed and -cutoff for one or multiple types can be extended. Also ranges of types -using the usual asterisk notation can be given. For granular pair styles, -the default cutoff is set to the sum of the current maximum atomic radii -for each type. +communication mode *multi* instead. Since the communication cutoffs are +determined per atom collections, a collection specifier is needed and +cutoff for one or multiple collections can be extended. Also ranges of +collections using the usual asterisk notation can be given. Collections +are indexed from 1 to N where N is the total number of collections. +Note that the arguments for *cutoff/multi* are parsed right before each +simulation to account for potential changes in the number of +collections. Custom cutoffs are preserved between runs but if +collections are redefined, one may want to re-specify the communication +cutoffs. For granular pair styles,the default cutoff is set to the sum +of the current maximum atomic radii for each collection. The +*cutoff/multi/old* option is similar to *cutoff/multi* except it +operates on atom types as opposed to collections. + +The *reduce/multi* option applies to *multi* and sets the communication +cutoff for a particle equal to the maximum interaction distance between particles +in the same collection. This reduces the number of +ghost atoms that need to be communicated. This method is only compatible with the +*multi* neighbor style and requires a half neighbor list and Newton on. +See the :doc:`neighbor multi ` command for more information. These are simulation scenarios in which it may be useful or even necessary to set a ghost cutoff > neighbor cutoff: @@ -123,7 +147,7 @@ ghost cutoff should be set. In the last scenario, a :doc:`fix ` or :doc:`compute ` or :doc:`pairwise potential ` needs to calculate with ghost atoms beyond the normal pairwise cutoff for some computation it -performs (e.g. locate neighbors of ghost atoms in a multibody pair +performs (e.g. locate neighbors of ghost atoms in a manybody pair potential). Setting the ghost cutoff appropriately can insure it will find the needed atoms. diff --git a/doc/src/commands_list.rst b/doc/src/commands_list.rst index e30d5c52dc..75affe7ce6 100644 --- a/doc/src/commands_list.rst +++ b/doc/src/commands_list.rst @@ -59,6 +59,7 @@ Commands lattice log mass + mdi_engine message min_modify min_spin diff --git a/doc/src/fix.rst b/doc/src/fix.rst index 109bfb00be..41c9732c62 100644 --- a/doc/src/fix.rst +++ b/doc/src/fix.rst @@ -182,6 +182,9 @@ accelerated styles exist. * :doc:`ave/time ` - compute/output global time-averaged quantities * :doc:`aveforce ` - add an averaged force to each atom * :doc:`balance ` - perform dynamic load-balancing +* :doc:`brownian ` - overdamped translational brownian motion +* :doc:`brownian/asphere ` - overdamped translational and rotational brownian motion for ellipsoids +* :doc:`brownian/sphere ` - overdamped translational and rotational brownian motion for spheres * :doc:`bocs ` - NPT style time integration with pressure correction * :doc:`bond/break ` - break bonds on the fly * :doc:`bond/create ` - create bonds on the fly diff --git a/doc/src/fix_bond_react.rst b/doc/src/fix_bond_react.rst index 5291aff81b..f42952ecdc 100644 --- a/doc/src/fix_bond_react.rst +++ b/doc/src/fix_bond_react.rst @@ -328,8 +328,8 @@ keyword 'ChiralIDs' lists the atom IDs of chiral atoms whose handedness should be enforced. The fifth optional section begins with the keyword 'Constraints' and lists additional criteria that must be satisfied in order for the reaction to occur. Currently, there are -five types of constraints available, as discussed below: 'distance', -'angle', 'dihedral', 'arrhenius', and 'rmsd'. +six types of constraints available, as discussed below: 'distance', +'angle', 'dihedral', 'arrhenius', 'rmsd', and 'custom'. A sample map file is given below: @@ -500,6 +500,45 @@ example, the molecule fragment could consist of only the backbone atoms of a polymer chain. This constraint can be used to enforce a specific relative position and orientation between reacting molecules. +The constraint of type 'custom' has the following syntax: + +.. parsed-literal:: + + custom *varstring* + +where 'custom' is the required keyword, and *varstring* is a +variable expression. The expression must be a valid equal-style +variable formula that can be read by the :doc:`variable ` command, +after any special reaction functions are evaluated. If the resulting +expression is zero, the reaction is prevented from occurring; +otherwise, it is permitted to occur. There are two special reaction +functions available, 'rxnsum' and 'rxnave'. These functions operate +over the atoms in a given reaction site, and have one mandatory +argument and one optional argument. The mandatory argument is the +identifier for an atom-style variable. The second, optional argument +is the name of a molecule fragment in the pre-reaction template, and +can be used to operate over a subset of atoms in the reaction site. +The 'rxnsum' function sums the atom-style variable over the reaction +site, while the 'rxnave' returns the average value. For example, a +constraint on the total potential energy of atoms involved in the +reaction can be imposed as follows: + +.. code-block:: LAMMPS + +compute 1 all pe/atom # in LAMMPS input script +variable my_pe atom c_1 # in LAMMPS input script + +.. code-block:: LAMMPS + +custom "rxnsum(v_my_pe) > 100" # in Constraints section of map file + +The above example prevents the reaction from occurring unless the +total potential energy of the reaction site is above 100. The variable +expression can be interpreted as the probability of the reaction +occurring by using an inequality and the 'random(x,y,z)' function +available as an equal-style variable input, similar to the 'arrhenius' +constraint above. + By default, all constraints must be satisfied for the reaction to occur. In other words, constraints are evaluated as a series of logical values using the logical AND operator "&&". More complex logic diff --git a/doc/src/fix_brownian.rst b/doc/src/fix_brownian.rst new file mode 100644 index 0000000000..d032346617 --- /dev/null +++ b/doc/src/fix_brownian.rst @@ -0,0 +1,216 @@ +.. index:: fix brownian +.. index:: fix brownian/sphere +.. index:: fix brownian/asphere + +fix brownian command +=========================== + +fix brownian/sphere command +=========================== + +fix brownian/sphere command +=========================== + + +Syntax +"""""" + +.. parsed-literal:: + + fix ID group-ID style_name temp seed keyword args + +* ID, group-ID are documented in :doc:`fix ` command +* style_name = *brownian* or *brownian/sphere* or *brownian/asphere* +* temp = temperature +* seed = random number generator seed +* one or more keyword/value pairs may be appended +* keyword = *rng* or *dipole* or *gamma_r_eigen* or *gamma_t_eigen* or *gamma_r* or *gamma_t* + + .. parsed-literal:: + + *rng* value = *uniform* or *gaussian* or *none* + *uniform* = use uniform random number generator + *gaussian* = use gaussian random number generator + *none* = turn off noise + *dipole* value = *mux* and *muy* and *muz* for *brownian/asphere* + *mux*, *muy*, and *muz* = update orientation of dipole having direction (*mux*,*muy*,*muz*) in body frame of rigid body + *gamma_r_eigen* values = *gr1* and *gr2* and *gr3* for *brownian/asphere* + *gr1*, *gr2*, and *gr3* = diagonal entries of body frame rotational friction tensor + *gamma_r* values = *gr* for *brownian/sphere* + *gr* = magnitude of the (isotropic) rotational friction tensor + *gamma_t_eigen* values = *gt1* and *gt2* and *gt3* for *brownian/asphere* + *gt1*, *gt2*, and *gt3* = diagonal entries of body frame translational friction tensor + *gamma_t* values = *gt* for *brownian* and *brownian/sphere* + *gt* = magnitude of the (isotropic) translational friction tensor + + +Examples +"""""""" + +.. code-block:: LAMMPS + + fix 1 all brownian 1.0 12908410 gamma_t 1.0 + fix 1 all brownian 1.0 12908410 gamma_t 3.0 rng gaussian + fix 1 all brownian/sphere 1.0 1294019 gamma_t 3.0 gamma_r 1.0 + fix 1 all brownian/sphere 1.0 19581092 gamma_t 1.0 gamma_r 0.3 rng none + fix 1 all brownian/asphere 1.0 1294019 gamma_t_eigen 1.0 2.0 3.0 gamma_r_eigen 4.0 7.0 8.0 rng gaussian + fix 1 all brownian/asphere 1.0 1294019 gamma_t_eigen 1.0 2.0 3.0 gamma_r_eigen 4.0 7.0 8.0 dipole 1.0 0.0 0.0 + + +Description +""""""""""" + +Perform Brownian Dynamics time integration to update position, velocity, +dipole orientation (for spheres) and quaternion orientation (for +ellipsoids, with optional dipole update as well) of all particles in the +fix group in each timestep. Brownian Dynamics uses Newton's laws of +motion in the limit that inertial forces are negligible compared to +viscous forces. The stochastic equation of motion for the center of mass +positions is + +.. math:: + + d\mathbf{r} = \mathbf{\gamma}_t^{-1}\mathbf{F}dt+\sqrt{2k_BT}\mathbf{\gamma}_t^{-1/2}d\mathbf{W}_t, + +in the lab-frame (i.e. :math:`\mathbf{\gamma}_t` is not diagonal, but +only depends on orientation and so the noise is still additive). + +The rotational motion for the spherical and ellipsoidal particles is not +as simple an expression, but is chosen to replicate the Boltzmann +distribution for the case of conservative torques (see :ref:`(Ilie) +` or :ref:`(Delong) `). + +For the style *brownian*, only the positions of the particles are +updated. This is therefore suitable for point particle simulations. + +For the style *brownian/sphere*, the positions of the particles are +updated, and a dipole slaved to the spherical orientation is also +updated. This style therefore requires the hybrid atom style +:doc:`atom_style dipole ` and :doc:`atom_style sphere +`. + +For the style *brownian/asphere*, the center of mass positions and the +quaternions of ellipsoidal particles are updated. This fix style is +suitable for equations of motion where the rotational and translational +friction tensors can be diagonalized in a certain (body) reference frame. + + +--------- + +.. note:: + + This integrator does not by default assume a relationship between the + rotational and translational friction tensors, though such a relationship + should exist in the case of no-slip boundary conditions between the particles and + the surrounding (implicit) solvent. E.g. in the case of spherical particles, + the condition :math:`\gamma_t=3\gamma_r/\sigma^2` must be explicitly + accounted for by setting *gamma_t* to 3x and *gamma_r* to x (where + :math:`\sigma` is the spherical diameter). A similar (though more complex) + relationship holds for ellipsoids and rod-like particles. + +--------- + +.. note:: + + Temperature computation using the :doc:`compute temp ` + will not correctly compute temperature of these overdamped dynamics + since we are explicitly neglecting inertial effects. + Furthermore, this time integrator does not add the stochastic terms or + viscous terms to the force and/or torques. Rather, they are just added + in to the equations of motion to update the degrees of freedom. + +--------- + + +If the *rng* keyword is used with the *uniform* value, then the noise +is generated from a uniform distribution (see +:ref:`(Dunweg) ` for why this works). This is the same method +of noise generation as used in :doc:`fix_langevin `. + +If the *rng* keyword is used with the *gaussian* value, then the noise +is generated from a gaussian distribution. Typically this added +complexity is unnecessary, and one should be fine using the *uniform* +value for reasons argued in :ref:`(Dunweg) `. + +If the *rng* keyword is used with the *none* value, then the noise +terms are set to zero. + +The *gamma_t* keyword sets the (isotropic) translational viscous damping. +Required for (and only compatible with) *brownian* and *brownian/sphere*. +The units of *gamma_t* are mass/time. + +The *gamma_r* keyword sets the (isotropic) rotational viscous damping. +Required for (and only compatible with) *brownian/sphere*. +The units of *gamma_r* are mass*length**2/time. + +The *gamma_r_eigen*, and *gamma_t_eigen* keywords are the eigenvalues of +the rotational and viscous damping tensors (having the same units as +their isotropic counterparts). Required for (and only compatible with) +*brownian/asphere*. For a 2D system, the first two values of *gamma_r_eigen* +must be inf (only rotation in xy plane), and the third value of *gamma_t_eigen* +must be inf (only diffusion in xy plane). + +If the *dipole* keyword is used, then the dipole moments of the particles +are updated as described above. Only compatible with *brownian/asphere* +(as *brownian/sphere* updates dipoles automatically). + +---------- + +.. note:: + For style *brownian/asphere*, the components *gamma_t_eigen* =(x,x,x) and + *gamma_r_eigen* = (y,y,y), the dynamics will replicate those of the + *brownian/sphere* style with *gamma_t* = x and *gamma_r* = y. + +---------- + +Restart, fix_modify, output, run start/stop, minimize info +""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + +No information about this fix is written to :doc:`binary restart files `. +No global or per-atom quantities are stored +by this fix for access by various :doc:`output commands `. + + +No parameter of this fix can be used with the *start/stop* keywords of +the :doc:`run ` command. This fix is not invoked during +:doc:`energy minimization `. + +Restrictions +"""""""""""" + +The style *brownian/sphere* fix requires that atoms store torque and angular velocity (omega) +as defined by the :doc:`atom_style sphere ` command. +The style *brownian/asphere* fix requires that atoms store torque and quaternions +as defined by the :doc:`atom_style ellipsoid ` command. +If the *dipole* keyword is used, they must also store a dipole moment +as defined by the :doc:`atom_style dipole ` command. + +This fix is part of the USER-BROWNIAN package. It is only enabled if +LAMMPS was built with that package. See the :doc:`Build package ` +doc page for more info. + +Related commands +"""""""""""""""" + +:doc:`fix propel/self `, +:doc:`fix langevin `, :doc:`fix nve/sphere `, + +Default +""""""" + +The default for *rng* is *uniform*. The default for the rotational and translational friction +tensors are the identity tensor. + +---------- + +.. _Ilie1: + +**(Ilie)** Ilie, Briels, den Otter, Journal of Chemical Physics, 142, 114103 (2015). + +.. _Delong1: + +**(Delong)** Delong, Usabiaga, Donev, Journal of Chemical Physics. 143, 144107 (2015) + +.. _Dunweg7: + +**(Dunweg)** Dunweg and Paul, Int J of Modern Physics C, 2, 817-27 (1991). diff --git a/doc/src/fix_mdi_engine.rst b/doc/src/fix_mdi_engine.rst new file mode 100644 index 0000000000..a2f25780f5 --- /dev/null +++ b/doc/src/fix_mdi_engine.rst @@ -0,0 +1,59 @@ +.. index:: fix move + +fix mdi/engine command +====================== + +Syntax +"""""" + +.. parsed-literal:: + + fix ID group-ID mdi/engine + +* ID, group-ID are documented in :doc:`fix ` command +* mdi/engine = style name of this fix command + +Examples +"""""""" + +.. code-block:: LAMMPS + + fix 1 all mdi/engine + +Description +""""""""""" + +This fix is used along with the :doc:`mdi/engine ` command +to enable LAMMPS to use the `MDI Library +`_ to run as +an MDI engine. The fix provides hooks that enable MDI driver codes to +communicate with LAMMPS at various points within a LAMMPS timestep. + +It is not generally necessary to add this fix to a LAMMPS input file, +even when using the :doc:`mdi/engine ` command. If the +:doc:`mdi/engine ` command is executed and this fix is not +present, it will automatically be added and applied as a new fix for +all atoms for the duration of the command. Thus it is only necessary +to add this fix to an input file when you want to modify the group-ID +or the ordering of this fix relative to other fixes in the input script. + +For more information about running LAMMPS as an MDI engine, see the +:doc:`mdi/engine ` command and the :doc:`Howto mdi +` doc page. + +Restrictions +"""""""""""" + +This command is part of the USER-MDI package. It is only enabled if +LAMMPS was built with that package. See the :doc:`Build package +` doc page for more info. + +Related commands +"""""""""""""""" + +:doc:`mdi/engine ` + +Default +""""""" + +none diff --git a/doc/src/fix_propel_self.rst b/doc/src/fix_propel_self.rst index 3a1bfb3166..ddc96817f9 100644 --- a/doc/src/fix_propel_self.rst +++ b/doc/src/fix_propel_self.rst @@ -8,52 +8,121 @@ Syntax .. parsed-literal:: - fix ID group-ID propel/self mode magnitude keyword values ... + fix ID group-ID propel/self mode magnitude keyword values * ID, group-ID are documented in :doc:`fix ` command * propel/self = style name of this fix command -* mode = velocity or quat -* magnitude = magnitude of the active force -* one or more keyword/value pairs may be appended to args -* keyword = *types* +* mode = *dipole* or *velocity* or *quat* +* magnitude = magnitude of self-propulsion force +* zero or one keyword/value pairs may be appended +* keyword = *qvector* + + .. parsed-literal:: + + *qvector* value = direction of force in ellipsoid frame + *sx*, *sy*, *sz* = components of *qvector* - *types* values = one or more atom types Examples """""""" .. code-block:: LAMMPS - fix active_group all propel/self velocity 1.0 - fix constant_velocity all viscous 1.0 - - fix active_group all propel/self quat 1.0 - - fix active all propel/self quat 1.0 types 1 2 4 + fix active all propel/self dipole 40.0 + fix active all propel/self velocity 10.0 + fix active all propel/self quat 15.7 qvector 1.0 0.0 0.0 Description """"""""""" -Adds a force of a constant magnitude to each atom in the group. The nature in -which the force is added depends on the mode. +Add a force to each atom in the group due to a self-propulsion force. The +force is given by -For *mode* = *velocity*, the active force acts along the velocity vector of -each atom. This can be interpreted as a velocity-dependent friction, -such as proposed by :ref:`(Erdmann) `. +.. math:: -For *mode* = *quat* the force is applied along the axis obtained -by rotating the x-axis along the atom's quaternion. In other words, the -force is along the x-axis in the atom's body frame. This mode requires -all atoms in the group to have a quaternion, so atom_style should -either be ellipsoid or body. In combination with Langevin thermostat -for translation and rotation in the overdamped regime, the quaternion -mode corresponds to the active Brownian particle model introduced by -:ref:`(Henkes) `, :ref:`(Bialke) ` and :ref:`(Fily) -`. + F_i = f_P e_i -By default, this fix is applied to all atoms in the group. You can -override this behavior by specifying the atom types the fix should work -on through the *types* keyword. +where *i* is the particle the force is being applied to, :math:`f_P` +is the magnitude of the force, and :math:`e_i` is the vector direction +of the force. The specification of :math:`e_i` is based on which of the +three keywords (*dipole* or *velocity* or *quat*) one selects. + +For mode *dipole*, :math:`e_i` is just equal to +the dipole vectors of the atoms in the group. Therefore, if the dipoles +are not unit vectors, the :math:`e_i` will not be unit vectors. + +.. note:: + + If another command changes the magnitude of the dipole, this force will + change accordingly (since :math:`|e_i|` will change, which is physically + equivalent to re-scaling :math:`f_P` while keeping :math:`|e_i|` constant), + and no warning will be provided by LAMMPS. This is almost never what you + want, so ensure you are not changing dipole magnitudes with another LAMMPS + fix or pair style. Furthermore, self-propulsion forces (almost) always + set :math:`e_i` to be a unit vector for all times, so it's best to set + all the dipole magnitudes to 1.0 unless you have a good reason not to + (see the :doc:`set ` command on how to do this). + +For mode *velocity*, :math:`e_i` points in the direction +of the current velocity (a unit-vector). This can be interpreted as a +velocity-dependent friction, as proposed by e.g. :ref:`(Erdmann) `. + +For mode *quat*, :math:`e_i` points in the direction of a unit +vector, oriented in the coordinate frame of the ellipsoidal particles, +which defaults to point along the x-direction. This default behavior +can be changed by via the *quatvec* keyword. + +The optional *quatvec* keyword specifies the direction of self-propulsion +via a unit vector (sx,sy,sz). The arguments *sx*, *sy*, and *sz*, are +defined within the coordinate frame of the atom's +ellipsoid. For instance, for an ellipsoid with long axis along +its x-direction, if one wanted the self-propulsion force to also +be along this axis, set *sx* equal to 1 and *sy*, *sz* both equal +to zero. This keyword may only be specified for mode *quat*. + +.. note:: + + In using keyword *quatvec*, the three arguments *sx*, + *sy*, and *sz* will be automatically normalized to components + of a unit vector internally to avoid users having to explicitly + do so themselves. Therefore, in mode *quat*, the vectors :math:`e_i` + will always be of unit length. + + +Along with adding a force contribution, this fix can also +contribute to the virial (pressure) of the system, defined as +:math:`f_P \sum_i /(d V)`, where :math:`r_i` is the +*unwrapped* coordinate of particle i in the case of periodic +boundary conditions. See :ref:`(Winkler) ` for a +discussion of this active pressure contribution. + +For modes *dipole* and *quat*, this fix is by default +included in pressure computations. + +For mode *velocity*, this fix is by default not included +in pressure computations. + + +.. note:: + + In contrast to equilibrium systems, pressure of active systems + in general depends on the geometry of the container. + The active pressure contribution as calculated in this fix + is only valid for certain boundary conditions (spherical + walls, rectangular walls, or periodic boundary conditions). + For other geometries, the pressure must be measured via + explicit calculation of the force per unit area on a wall, + and so one must not calculate it using this fix. + (Use :doc:`fix_modify ` as described below + to turn off the virial contribution of this fix). Again, + see :ref:`(Winkler) ` for discussion of why this + is the case. + + Furthermore, when dealing with active systems, the temperature + is no longer well defined. Therefore, one should ensure that + the *virial* flag is used in the + :doc:`compute pressure ` command (turning + off temperature contributions). ---------- @@ -62,40 +131,48 @@ Restart, fix_modify, output, run start/stop, minimize info No information about this fix is written to :doc:`binary restart files `. -This fix is not imposed during minimization. +The :doc:`fix_modify ` *virial* option is supported by this +fix to add the contribution due to the added forces on atoms to the +system's virial as part of :doc:`thermodynamic output `. +The default is *virial yes* for keywords *dipole* and *quat*. The +default is *virial no* for keyword *velocity*. + + +No parameter of this fix can be used with the *start/stop* keywords of +the :doc:`run ` command. + Restrictions """""""""""" -In quat mode, this fix makes use of per-atom quaternions to take -into account the fact that the orientation can rotate and hence the -direction of the active force can change. The quat mode -of this fix only works with atom_style ellipsoid. +With keyword *dipole*, this fix only works when the DIPOLE package is enabled. +See the :doc:`Build package ` doc page for more info. + +This fix is part of the USER-BROWNIAN package. It is only enabled if +LAMMPS was built with that package. See the :doc:`Build package ` +doc page for more info. + Related commands """""""""""""""" -:doc:`fix setforce `, :doc:`fix addforce ` - -.. _Erdmann: - -**(Erdmann)** U. Erdmann , W. Ebeling, L. Schimansky-Geier, and F. Schweitzer, -Eur. Phys. J. B 15, 105-113, 2000. - -.. _Henkes: - -**(Henkes)** Henkes, S, Fily, Y., and Marchetti, M. C. Phys. Rev. E, 84, 040301(R), 2011. - -.. _Bialke: - -**(Bialke)** J. Bialke, T. Speck, and H Loewen, Phys. Rev. Lett. 108, 168301, 2012. - -.. _Fily: - -**(Fily)** Y. Fily and M.C. Marchetti, Phys. Rev. Lett. 108, 235702, 2012. +:doc:`fix efield ` , :doc:`fix setforce `, +:doc:`fix addforce ` Default """"""" -types +none +---------- + + +.. _Erdmann1: + +**(Erdmann)** U. Erdmann , W. Ebeling, L. Schimansky-Geier, and F. Schweitzer, +Eur. Phys. J. B 15, 105-113, 2000. + + +.. _Winkler1: + +**(Winkler)** Winkler, Wysocki, and Gompper, Soft Matter, 11, 6680 (2015). diff --git a/doc/src/mdi_engine.rst b/doc/src/mdi_engine.rst new file mode 100644 index 0000000000..927a518f4e --- /dev/null +++ b/doc/src/mdi_engine.rst @@ -0,0 +1,88 @@ +.. index:: mdi_engine + +mdi_engine command +================== + +Syntax +"""""" + +.. parsed-literal:: + + mdi_engine + +Description +""""""""""" + +This command is used to have LAMMPS act as a server with another +client code to effectively couple the two codes together in +client/server mode. + +More specifically, this command causes LAMMPS to begin using the `MDI +Library `_ +to run as an MDI engine (server), responding to commands made by an +external MDI driver code (client). See the :doc:`Howto mdi +` doc page for more information about how LAMMPS can work +as both an MDI driver or engine. + +General information about launching codes that communicate using the +MDI Library can be found in the `corresponding page +`_ +of the MDI Library's documentation. + +---------- + +This command should typically be used in an input script after LAMMPS +has setup the system it is going to model in collaboration with the +driver code. Depending on how the driver code tells the LAMMPS engine +to exit, other commands can be executed after this command, but +typically it should be used at the end of the LAMMPS input script. + +To act as a MD-based MDI engine, this is the list of MDI commands from +a driver code which LAMMPS currently recognizes. See more details +about these commands in the `MDI library documentation +`_ +.. NOTE: Taylor - is this the best link for this info? Can we flesh this +.. out with the full list of supported commands? Maybe the distinction +.. of what "node" the commands refer to is not needed in this table? + +.. list-table:: + :widths: 20 80 + :header-rows: 1 + + * - Command name + - Action + * - >NATOMS + - Driver sends the number of atoms in the system + * - FORCES + - Driver sends 3*N double-precision atom forces + * - ` doc page for more info. + +Related commands +"""""""""""""""" + +:doc:`fix mdi/engine ` + +Default +""""""" + +None diff --git a/doc/src/neigh_modify.rst b/doc/src/neigh_modify.rst index 2618953dd7..25c2c2ac77 100644 --- a/doc/src/neigh_modify.rst +++ b/doc/src/neigh_modify.rst @@ -14,7 +14,7 @@ Syntax .. parsed-literal:: - keyword = *delay* or *every* or *check* or *once* or *cluster* or *include* or *exclude* or *page* or *one* or *binsize* + keyword = *delay* or *every* or *check* or *once* or *cluster* or *include* or *exclude* or *page* or *one* or *binsize* or *collection/type* or *collection/interval* *delay* value = N N = delay building until this many steps since last build *every* value = M @@ -47,6 +47,12 @@ Syntax N = max number of neighbors of one atom *binsize* value = size size = bin size for neighbor list construction (distance units) + *collection/type* values = N arg1 ... argN + N = number of custom collections + arg = N separate lists of types (see below) + *collection/interval* values = N arg1 ... argN + N = number of custom collections + arg = N separate cutoffs for intervals (see below) Examples """""""" @@ -58,6 +64,8 @@ Examples neigh_modify exclude group frozen frozen check no neigh_modify exclude group residue1 chain3 neigh_modify exclude molecule/intra rigid + neigh_modify collection/type 2 1*2,5 3*4 + neigh_modify collection/interval 2 1.0 10.0 Description """"""""""" @@ -188,8 +196,9 @@ atom can have. The *binsize* option allows you to specify what size of bins will be used in neighbor list construction to sort and find neighboring atoms. By default, for :doc:`neighbor style bin `, LAMMPS uses bins -that are 1/2 the size of the maximum pair cutoff. For :doc:`neighbor style multi `, the bins are 1/2 the size of the minimum pair -cutoff. Typically these are good values for minimizing the time for +that are 1/2 the size of the maximum pair cutoff. For :doc:`neighbor style multi `, +the bins are 1/2 the size of the collection interaction cutoff. +Typically these are good values for minimizing the time for neighbor list construction. This setting overrides the default. If you make it too big, there is little overhead due to looping over bins, but more atoms are checked. If you make it too @@ -197,6 +206,31 @@ small, the optimal number of atoms is checked, but bin overhead goes up. If you set the binsize to 0.0, LAMMPS will use the default binsize of 1/2 the cutoff. +The *collection/type* option allows you to define collections of atom +types, used by the *multi* neighbor mode. By grouping atom types with +similar physical size or interaction cutoff lengths, one may be able +to improve performance by reducing +overhead. You must first specify the number of collections N to be +defined followed by N lists of types. Each list consists of a series of type +ranges separated by commas. The range can be specified as a +single numeric value, or a wildcard asterisk can be used to specify a range +of values. This takes the form "\*" or "\*n" or "n\*" or "m\*n". For +example, if M = the number of atom types, then an asterisk with no numeric +values means all types from 1 to M. A leading asterisk means all types +from 1 to n (inclusive). A trailing asterisk means all types from n to M +(inclusive). A middle asterisk means all types from m to n (inclusive). +Note that all atom types must be included in exactly one of the N collections. + +The *collection/interval* option provides a similar capability. This +command allows a user to define collections by specifying a series of +cutoff intervals. LAMMPS will automatically sort atoms into these +intervals based on their type-dependent cutoffs or their finite size. +You must first specify the number of collections N to be defined +followed by N values representing the upper cutoff of each interval. +This command is particularly useful for granular pair styles where the +interaction distance of particles depends on their radius and may not +depend on their atom type. + Restrictions """""""""""" diff --git a/doc/src/neighbor.rst b/doc/src/neighbor.rst index 98ee0d0b6a..1b10ec8998 100644 --- a/doc/src/neighbor.rst +++ b/doc/src/neighbor.rst @@ -11,7 +11,7 @@ Syntax neighbor skin style * skin = extra distance beyond force cutoff (distance units) -* style = *bin* or *nsq* or *multi* +* style = *bin* or *nsq* or *multi* or *multi/old* Examples """""""" @@ -55,14 +55,31 @@ For the *bin* style, the bin size is set to 1/2 of the largest cutoff distance between any pair of atom types and a single set of bins is defined to search over for all atom types. This can be inefficient if one pair of types has a very long cutoff, but -other type pairs have a much shorter cutoff. For style *multi* the -bin size is set to 1/2 of the shortest cutoff distance and multiple -sets of bins are defined to search over for different atom types. +other type pairs have a much shorter cutoff. The *multi* style uses +different sized bins for collections of different sized particles, where +"size" may mean the physical size of the particle or its cutoff +distance for interacting with other particles. Different +sets of bins are then used to construct the neighbor lists as as further +described by Shire, Hanley, and Stratford :ref:`(Shire) `. This imposes some extra setup overhead, but the searches themselves -may be much faster for the short-cutoff cases. -See the :doc:`comm_modify mode multi ` command for a -communication option that may also be beneficial for simulations of -this kind. +may be much faster. By default, each atom type defines a separate +collection of particles. For systems where two or more atom types +have the same size (either physical size or cutoff distance), the +definition of collections can be customized, which can result in less +overhead and faster performance. See the :doc:`neigh_modify ` +command for how to define custom collections. Whether the collection +definition is customized or not, also see the +:doc:`comm_modify mode multi ` command for communication +options that further improve performance in a manner consistent with +neighbor style multi. + +An alternate style, *multi/old*, sets the bin size to 1/2 of the shortest +cutoff distance and multiple sets of bins are defined to search over for +different atom types. This algorithm used to be the default *multi* +algorithm in LAMMPS but was found to be significantly slower than the new +approach. For now we are keeping the old option in case there are use cases +where multi/old outperforms the new multi style. + The :doc:`neigh_modify ` command has additional options that control how often neighbor lists are built and which pairs are @@ -90,3 +107,9 @@ Default | 0.001 bin for units = si, skin = 0.001 meters = 1.0 mm | 0.1 bin for units = cgs, skin = 0.1 cm = 1.0 mm | + +---------- + +.. _bytype-Shire: + +**(Shire)** Shire, Hanley and Stratford, Comp Part Mech, (2020). diff --git a/doc/src/pair_lj_smooth.rst b/doc/src/pair_lj_smooth.rst index 7ba12c89ba..a77eefd023 100644 --- a/doc/src/pair_lj_smooth.rst +++ b/doc/src/pair_lj_smooth.rst @@ -1,10 +1,11 @@ .. index:: pair_style lj/smooth +.. index:: pair_style lj/smooth/gpu .. index:: pair_style lj/smooth/omp pair_style lj/smooth command ============================ -Accelerator Variants: *lj/smooth/omp* +Accelerator Variants: *lj/smooth/gpu*, *lj/smooth/omp* Syntax """""" diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index fc40ad004c..7bbafaeafb 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -47,6 +47,7 @@ Agnolin Ai Aidan aij +aimd airebo Aj ajs @@ -196,6 +197,7 @@ Ballenegger Bammann Banna Barashev +barnes barostat Barostats barostatted @@ -312,6 +314,7 @@ br Branduardi Branicio brennan +Briels Brien Brilliantov Broadwell @@ -626,6 +629,7 @@ delflag Dellago delocalization delocalized +Delong delr deltaHf Dendrimer @@ -709,6 +713,7 @@ dodgerblue dof doi Donadio +Donev dotc Doty doxygen @@ -1143,6 +1148,7 @@ GMock gneb GNEB Goldfarb +Gompper Gonzalez-Melchor googlemail googletest @@ -1203,6 +1209,7 @@ Halver Hamaker Hamel Hammerschmidt +Hanley haptic Hara Harpertown @@ -1265,6 +1272,7 @@ holonomic Homebrew hooke Hookean +hostname hotpink Houlle howto @@ -1321,6 +1329,7 @@ ijk ijkl ik Ikeshoji +Ilie ilmenau Ilmenau ilp @@ -1723,6 +1732,7 @@ Lmpsdata lmptype LMT ln +localhost localTemp localvectors Loewen @@ -1853,6 +1863,8 @@ mc McLachlan md mdf +MDI +mdi mdpd mDPD meam @@ -3109,6 +3121,7 @@ Swinburne Swol Swope Sx +sx sy Sy symplectic @@ -3118,6 +3131,7 @@ sysdim Syst systemd Sz +sz Tabbernor tabinner Tadmor @@ -3136,6 +3150,7 @@ Tanmoy Tartakovsky taskset taubi +taylor tb tchain Tchain @@ -3358,6 +3373,7 @@ upenn upto Urbakh Urbana +Usabiaga usec uSemiParallel userguide @@ -3502,6 +3518,7 @@ Wikipedia Wildcard wildcard wildcards +Winkler Wirnsberger wirtes witin @@ -3513,6 +3530,7 @@ Worley Wriggers Wuppertal Wurtzite +Wysocki www wx Wx diff --git a/examples/USER/brownian/2d_velocity/in2d.velocity b/examples/USER/brownian/2d_velocity/in2d.velocity new file mode 100644 index 0000000000..fcfae0554b --- /dev/null +++ b/examples/USER/brownian/2d_velocity/in2d.velocity @@ -0,0 +1,53 @@ +##### 2d overdamped brownian dynamics with self-propulsion force in direction of velocity. ##### + +variable gamma_t equal 1.0 +variable temp equal 1.0 +variable seed equal 1974019 +variable fp equal 4.0 +variable params string ${gamma_t}_${temp}_${fp} + +units lj +dimension 2 +newton off + +lattice sq 0.4 +region box block -16 16 -16 16 -0.2 0.2 +create_box 1 box +create_atoms 1 box +mass * 1.0 +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix step all brownian ${temp} ${seed} gamma_t ${gamma_t} +fix vel all propel/self velocity ${fp} +fix 2 all enforce2d +fix_modify vel virial yes + +compute press all pressure NULL virial + +thermo_style custom step temp epair c_press + +#equilibration +timestep 0.0000000001 +thermo 500 +run 5000 +reset_timestep 0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] c_press + +timestep 0.00001 +thermo 1000 + +# main run +run 12000 + + diff --git a/examples/USER/brownian/2d_velocity/log.11May2021.in2d_velocity.g++.1 b/examples/USER/brownian/2d_velocity/log.11May2021.in2d_velocity.g++.1 new file mode 100644 index 0000000000..a5a58ba9f0 --- /dev/null +++ b/examples/USER/brownian/2d_velocity/log.11May2021.in2d_velocity.g++.1 @@ -0,0 +1,151 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### 2d overdamped brownian dynamics with self-propulsion force in direction of velocity. ##### + +variable gamma_t equal 1.0 +variable temp equal 1.0 +variable seed equal 1974019 +variable fp equal 4.0 +variable params string ${gamma_t}_${temp}_${fp} +variable params string 1_${temp}_${fp} +variable params string 1_1_${fp} +variable params string 1_1_4 + +units lj +dimension 2 +newton off + +lattice sq 0.4 +Lattice spacing in x,y,z = 1.5811388 1.5811388 1.5811388 +region box block -16 16 -16 16 -0.2 0.2 +create_box 1 box +Created orthogonal box = (-25.298221 -25.298221 -0.31622777) to (25.298221 25.298221 0.31622777) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 1024 atoms + create_atoms CPU = 0.001 seconds +mass * 1.0 +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix step all brownian ${temp} ${seed} gamma_t ${gamma_t} +fix step all brownian 1 ${seed} gamma_t ${gamma_t} +fix step all brownian 1 1974019 gamma_t ${gamma_t} +fix step all brownian 1 1974019 gamma_t 1 +fix vel all propel/self velocity ${fp} +fix vel all propel/self velocity 4 +fix 2 all enforce2d +fix_modify vel virial yes + +compute press all pressure NULL virial + +thermo_style custom step temp epair c_press + +#equilibration +timestep 0.0000000001 +thermo 500 +run 5000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 2.289 | 2.289 | 2.289 Mbytes +Step Temp E_pair c_press + 0 1 0 -0.18336111 + 500 2.0519273e+10 0 -0.048238222 + 1000 1.9821717e+10 0 -0.4711053 + 1500 1.9697609e+10 0 -0.13539588 + 2000 2.0209443e+10 0 0.0094958039 + 2500 1.9591299e+10 0 0.40117118 + 3000 2.089566e+10 0 -0.036548251 + 3500 1.978692e+10 0 0.28282578 + 4000 2.0657848e+10 0 0.17618064 + 4500 2.0837353e+10 0 -0.080724651 + 5000 2.0348316e+10 0 -0.17471195 +Loop time of 0.575164 on 1 procs for 5000 steps with 1024 atoms + +Performance: 0.075 tau/day, 8693.168 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.0036819 | 0.0036819 | 0.0036819 | 0.0 | 0.64 +Output | 0.00027752 | 0.00027752 | 0.00027752 | 0.0 | 0.05 +Modify | 0.51999 | 0.51999 | 0.51999 | 0.0 | 90.41 +Other | | 0.05121 | | | 8.90 + +Nlocal: 1024.00 ave 1024 max 1024 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 65.0000 ave 65 max 65 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 0 +Dangerous builds = 0 +reset_timestep 0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] c_press + +timestep 0.00001 +thermo 1000 + +# main run +run 12000 +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 2.664 | 2.664 | 2.664 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] c_press + 0 2.0328444e+10 0 0 0 0 0 -0.17471195 + 1000 197017.59 0 0.018147562 0.019839233 0 0.037986796 -0.71897807 + 2000 197030.23 0 0.03909867 0.041721342 0 0.080820011 -0.30051929 + 3000 201997.2 0 0.065694399 0.06235257 0 0.12804697 -0.85167039 + 4000 199927.76 0 0.085698715 0.080328815 0 0.16602753 0.18493117 + 5000 198665.7 0 0.10896054 0.097021266 0 0.2059818 -0.090735406 + 6000 199277.78 0 0.13081111 0.11724814 0 0.24805925 -0.18189034 + 7000 199850.54 0 0.14721838 0.13806858 0 0.28528696 0.11334674 + 8000 191577.11 0 0.16582149 0.15935853 0 0.32518002 -0.73284569 + 9000 197331.29 0 0.17995704 0.18652927 0 0.3664863 -0.015558407 + 10000 197048.17 0 0.2034106 0.20329856 0 0.40670916 0.36985211 + 11000 200105.54 0 0.21809835 0.21966463 0 0.43776298 0.36437 + 12000 203180.39 0 0.23810386 0.23666184 0 0.47476569 -0.072006034 +Loop time of 1.37465 on 1 procs for 12000 steps with 1024 atoms + +Performance: 7542.303 tau/day, 8729.517 timesteps/s +99.8% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0.00012231 | 0.00012231 | 0.00012231 | 0.0 | 0.01 +Comm | 0.0024607 | 0.0024607 | 0.0024607 | 0.0 | 0.18 +Output | 0.00068665 | 0.00068665 | 0.00068665 | 0.0 | 0.05 +Modify | 1.2479 | 1.2479 | 1.2479 | 0.0 | 90.78 +Other | | 0.1235 | | | 8.98 + +Nlocal: 1024.00 ave 1024 max 1024 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 15 +Dangerous builds = 0 + + +Total wall time: 0:00:01 diff --git a/examples/USER/brownian/2d_velocity/log.11May2021.in2d_velocity.g++.4 b/examples/USER/brownian/2d_velocity/log.11May2021.in2d_velocity.g++.4 new file mode 100644 index 0000000000..df9ff24070 --- /dev/null +++ b/examples/USER/brownian/2d_velocity/log.11May2021.in2d_velocity.g++.4 @@ -0,0 +1,151 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### 2d overdamped brownian dynamics with self-propulsion force in direction of velocity. ##### + +variable gamma_t equal 1.0 +variable temp equal 1.0 +variable seed equal 1974019 +variable fp equal 4.0 +variable params string ${gamma_t}_${temp}_${fp} +variable params string 1_${temp}_${fp} +variable params string 1_1_${fp} +variable params string 1_1_4 + +units lj +dimension 2 +newton off + +lattice sq 0.4 +Lattice spacing in x,y,z = 1.5811388 1.5811388 1.5811388 +region box block -16 16 -16 16 -0.2 0.2 +create_box 1 box +Created orthogonal box = (-25.298221 -25.298221 -0.31622777) to (25.298221 25.298221 0.31622777) + 2 by 2 by 1 MPI processor grid +create_atoms 1 box +Created 1024 atoms + create_atoms CPU = 0.001 seconds +mass * 1.0 +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix step all brownian ${temp} ${seed} gamma_t ${gamma_t} +fix step all brownian 1 ${seed} gamma_t ${gamma_t} +fix step all brownian 1 1974019 gamma_t ${gamma_t} +fix step all brownian 1 1974019 gamma_t 1 +fix vel all propel/self velocity ${fp} +fix vel all propel/self velocity 4 +fix 2 all enforce2d +fix_modify vel virial yes + +compute press all pressure NULL virial + +thermo_style custom step temp epair c_press + +#equilibration +timestep 0.0000000001 +thermo 500 +run 5000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 2.289 | 2.289 | 2.289 Mbytes +Step Temp E_pair c_press + 0 1 0 -0.18336111 + 500 1.9862591e+10 0 -0.32013566 + 1000 2.0093184e+10 0 -0.36609742 + 1500 1.9562283e+10 0 -0.53349351 + 2000 1.9903977e+10 0 0.63783249 + 2500 2.0260128e+10 0 0.30046413 + 3000 1.9948065e+10 0 -0.63093105 + 3500 1.9507486e+10 0 0.48762848 + 4000 2.0049087e+10 0 0.40289309 + 4500 1.9975813e+10 0 0.57649363 + 5000 2.0129291e+10 0 -0.41288352 +Loop time of 0.238949 on 4 procs for 5000 steps with 1024 atoms + +Performance: 0.181 tau/day, 20924.952 timesteps/s +92.6% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.0080078 | 0.024718 | 0.031782 | 6.2 | 10.34 +Output | 0.0001812 | 0.00029999 | 0.00063467 | 0.0 | 0.13 +Modify | 0.13401 | 0.14401 | 0.15438 | 2.4 | 60.27 +Other | | 0.06992 | | | 29.26 + +Nlocal: 256.000 ave 256 max 256 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Nghost: 33.0000 ave 33 max 33 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 0 +Dangerous builds = 0 +reset_timestep 0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] c_press + +timestep 0.00001 +thermo 1000 + +# main run +run 12000 +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 2.664 | 2.664 | 2.664 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] c_press + 0 2.0109634e+10 0 0 0 0 0 -0.41288352 + 1000 195711.46 0 0.020076462 0.020523099 0 0.040599561 -0.32125126 + 2000 203263.85 0 0.039242992 0.039661282 0 0.078904274 0.11008705 + 3000 197417.54 0 0.064938128 0.057716419 0 0.12265455 0.16967601 + 4000 200505.97 0 0.086511225 0.074975267 0 0.16148649 0.31338473 + 5000 199373.77 0 0.10583263 0.098175658 0 0.20400829 0.34205791 + 6000 192881.14 0 0.12152088 0.11706037 0 0.23858125 -0.27870467 + 7000 203045.3 0 0.1383248 0.13629503 0 0.27461983 -0.046936646 + 8000 198544.08 0 0.16064738 0.1582206 0 0.31886798 -0.18803452 + 9000 205450.74 0 0.17926529 0.1829047 0 0.36216999 0.47191228 + 10000 200371.73 0 0.20084273 0.20365189 0 0.40449463 0.093098262 + 11000 202911.93 0 0.21569236 0.22221715 0 0.43790952 -0.38430031 + 12000 192590.04 0 0.24041439 0.24114487 0 0.48155926 -0.1677052 +Loop time of 0.443026 on 4 procs for 12000 steps with 1024 atoms + +Performance: 23402.683 tau/day, 27086.439 timesteps/s +97.7% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 3.2663e-05 | 3.3855e-05 | 3.4809e-05 | 0.0 | 0.01 +Comm | 0.0030291 | 0.0030628 | 0.0030825 | 0.0 | 0.69 +Output | 0.00027895 | 0.00051624 | 0.001184 | 0.0 | 0.12 +Modify | 0.31607 | 0.33372 | 0.37391 | 4.0 | 75.33 +Other | | 0.1057 | | | 23.86 + +Nlocal: 256.000 ave 259 max 253 min +Histogram: 1 0 0 1 0 0 1 0 0 1 +Nghost: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 15 +Dangerous builds = 0 + + +Total wall time: 0:00:00 diff --git a/examples/USER/brownian/asphere/in2d.ellipsoid b/examples/USER/brownian/asphere/in2d.ellipsoid new file mode 100644 index 0000000000..4b1a88bba7 --- /dev/null +++ b/examples/USER/brownian/asphere/in2d.ellipsoid @@ -0,0 +1,54 @@ +##### overdamped dynamics of non-interacting ellipsoids in 2D ##### + +variable rng string gaussian +variable seed string 198098 +variable temp string 1.0 +variable gamma_r_1 string inf +variable gamma_r_2 string inf +variable gamma_r_3 string 0.1 +variable gamma_t_1 string 5.0 +variable gamma_t_2 string 7.0 +variable gamma_t_3 string inf +variable params string ${rng}_${temp}_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} + +units lj +atom_style hybrid dipole ellipsoid +dimension 2 +newton off + +lattice sq 0.4 +region box block -30 30 -30 30 -0.2 0.2 +create_box 1 box +create_atoms 1 box +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * shape 3.0 1.0 1.0 +set type * quat/random ${seed} +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/asphere ${temp} ${seed} rng ${rng} & + gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} & + gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} & + dipole 1.0 0.0 0.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_2d.lammpstrj id type & +# x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 diff --git a/examples/USER/brownian/asphere/in3d.ellipsoid b/examples/USER/brownian/asphere/in3d.ellipsoid new file mode 100644 index 0000000000..34f2684ee7 --- /dev/null +++ b/examples/USER/brownian/asphere/in3d.ellipsoid @@ -0,0 +1,54 @@ +##### overdamped dynamics of non-interacting ellipsoids in 3D ##### + +variable rng string uniform +variable seed string 198098 +variable temp string 1.0 +variable gamma_r_1 string 2.0 +variable gamma_r_2 string 0.25 +variable gamma_r_3 string 0.1 +variable gamma_t_1 string 5.0 +variable gamma_t_2 string 7.0 +variable gamma_t_3 string 9.0 +variable params string ${rng}_${temp}_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} + +units lj +atom_style hybrid dipole ellipsoid +dimension 3 +newton off + +lattice sc 0.4 +region box block -8 8 -8 8 -8 8 +create_box 1 box +create_atoms 1 box +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * shape 3.0 1.0 1.0 +set type * quat/random ${seed} +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/asphere ${temp} ${seed} rng ${rng} & + gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} & + gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} & + dipole 1.0 0.0 0.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_3d.lammpstrj id type & +# x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 diff --git a/examples/USER/brownian/asphere/log.11May2021.in2d.ellipsoid.g++.1 b/examples/USER/brownian/asphere/log.11May2021.in2d.ellipsoid.g++.1 new file mode 100644 index 0000000000..619ab937c9 --- /dev/null +++ b/examples/USER/brownian/asphere/log.11May2021.in2d.ellipsoid.g++.1 @@ -0,0 +1,145 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### overdamped dynamics of non-interacting ellipsoids in 2D ##### + +variable rng string gaussian +variable seed string 198098 +variable temp string 1.0 +variable gamma_r_1 string inf +variable gamma_r_2 string inf +variable gamma_r_3 string 0.1 +variable gamma_t_1 string 5.0 +variable gamma_t_2 string 7.0 +variable gamma_t_3 string inf +variable params string ${rng}_${temp}_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_${temp}_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_inf_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_inf_inf_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_inf_inf_0.1_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_inf_inf_0.1_5.0_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_inf_inf_0.1_5.0_7.0_${gamma_t_3} +variable params string gaussian_1.0_inf_inf_0.1_5.0_7.0_inf + +units lj +atom_style hybrid dipole ellipsoid +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 2 +newton off + +lattice sq 0.4 +Lattice spacing in x,y,z = 1.5811388 1.5811388 1.5811388 +region box block -30 30 -30 30 -0.2 0.2 +create_box 1 box +Created orthogonal box = (-47.434165 -47.434165 -0.31622777) to (47.434165 47.434165 0.31622777) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 3600 atoms + create_atoms CPU = 0.005 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 198098 1.0 +Setting atom values ... + 3600 settings made for dipole/random +set type * shape 3.0 1.0 1.0 +Setting atom values ... + 3600 settings made for shape +set type * quat/random ${seed} +set type * quat/random 198098 +Setting atom values ... + 3600 settings made for quat/random +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/asphere ${temp} ${seed} rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 ${seed} rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf inf ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf inf 0.1 gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf inf 0.1 gamma_t_eigen 5.0 ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf inf 0.1 gamma_t_eigen 5.0 7.0 ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf inf 0.1 gamma_t_eigen 5.0 7.0 inf dipole 1.0 0.0 0.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_2d.lammpstrj id type # x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 5.114 | 5.114 | 5.114 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 0.99972222 0 0 0 0 0 + 100 34376.187 0 0.00034728749 0.00034318997 0 0.00069047747 + 200 36135.708 0 0.00067452936 0.0006818928 0 0.0013564222 + 300 34444.929 0 0.0010189319 0.00099849203 0 0.002017424 + 400 35668.312 0 0.0013648699 0.0013311884 0 0.0026960583 + 500 35388.615 0 0.0017330203 0.0016077184 0 0.0033407387 + 600 35180.065 0 0.002052223 0.0019394635 0 0.0039916865 + 700 34035.38 0 0.0024329956 0.0022664905 0 0.0046994861 + 800 34581.664 0 0.002783885 0.0025794872 0 0.0053633723 + 900 34579.945 0 0.003163442 0.0029351952 0 0.0060986372 + 1000 34158.066 0 0.0035589034 0.0032627605 0 0.0068216639 + 1100 33453.827 0 0.0038861895 0.003565372 0 0.0074515615 + 1200 33608.06 0 0.0041325698 0.0038943268 0 0.0080268966 + 1300 34381.633 0 0.004405682 0.0043294156 0 0.0087350976 + 1400 32925.746 0 0.0047383547 0.0046803517 0 0.0094187065 + 1500 34809.764 0 0.0051149571 0.0049309746 0 0.010045932 + 1600 33580.096 0 0.0054893472 0.0052465377 0 0.010735885 + 1700 34596.275 0 0.00581894 0.0056500316 0 0.011468972 + 1800 33926.736 0 0.0062129617 0.0059796125 0 0.012192574 + 1900 35577.131 0 0.0065668637 0.0062530163 0 0.01281988 + 2000 34224.967 0 0.0070005917 0.006598912 0 0.013599504 + 2100 33991.406 0 0.0073134826 0.0069119252 0 0.014225408 + 2200 34647.054 0 0.007659301 0.0073434715 0 0.015002772 + 2300 33956.835 0 0.007965191 0.0076318537 0 0.015597045 + 2400 35272.549 0 0.0082467116 0.007929202 0 0.016175914 + 2500 33901.494 0 0.0086251299 0.0082790757 0 0.016904206 + 2600 34138.227 0 0.0089419364 0.0086639744 0 0.017605911 + 2700 33691.013 0 0.0093083376 0.0090219118 0 0.018330249 + 2800 34716.817 0 0.0095840095 0.0094118945 0 0.018995904 + 2900 34473.982 0 0.0099773501 0.0098167668 0 0.019794117 + 3000 33406.776 0 0.010391969 0.010098625 0 0.020490594 +Loop time of 3.67112 on 1 procs for 3000 steps with 3600 atoms + +Performance: 706.051 tau/day, 817.189 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.037973 | 0.037973 | 0.037973 | 0.0 | 1.03 +Output | 0.0040674 | 0.0040674 | 0.0040674 | 0.0 | 0.11 +Modify | 3.515 | 3.515 | 3.515 | 0.0 | 95.75 +Other | | 0.1141 | | | 3.11 + +Nlocal: 3600.00 ave 3600 max 3600 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 121.000 ave 121 max 121 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 0 +Dangerous builds = 0 +Total wall time: 0:00:03 diff --git a/examples/USER/brownian/asphere/log.11May2021.in2d.ellipsoid.g++.4 b/examples/USER/brownian/asphere/log.11May2021.in2d.ellipsoid.g++.4 new file mode 100644 index 0000000000..b3da385dda --- /dev/null +++ b/examples/USER/brownian/asphere/log.11May2021.in2d.ellipsoid.g++.4 @@ -0,0 +1,145 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### overdamped dynamics of non-interacting ellipsoids in 2D ##### + +variable rng string gaussian +variable seed string 198098 +variable temp string 1.0 +variable gamma_r_1 string inf +variable gamma_r_2 string inf +variable gamma_r_3 string 0.1 +variable gamma_t_1 string 5.0 +variable gamma_t_2 string 7.0 +variable gamma_t_3 string inf +variable params string ${rng}_${temp}_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_${temp}_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_inf_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_inf_inf_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_inf_inf_0.1_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_inf_inf_0.1_5.0_${gamma_t_2}_${gamma_t_3} +variable params string gaussian_1.0_inf_inf_0.1_5.0_7.0_${gamma_t_3} +variable params string gaussian_1.0_inf_inf_0.1_5.0_7.0_inf + +units lj +atom_style hybrid dipole ellipsoid +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 2 +newton off + +lattice sq 0.4 +Lattice spacing in x,y,z = 1.5811388 1.5811388 1.5811388 +region box block -30 30 -30 30 -0.2 0.2 +create_box 1 box +Created orthogonal box = (-47.434165 -47.434165 -0.31622777) to (47.434165 47.434165 0.31622777) + 2 by 2 by 1 MPI processor grid +create_atoms 1 box +Created 3600 atoms + create_atoms CPU = 0.007 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 198098 1.0 +Setting atom values ... + 3600 settings made for dipole/random +set type * shape 3.0 1.0 1.0 +Setting atom values ... + 3600 settings made for shape +set type * quat/random ${seed} +set type * quat/random 198098 +Setting atom values ... + 3600 settings made for quat/random +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/asphere ${temp} ${seed} rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 ${seed} rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf inf ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf inf 0.1 gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf inf 0.1 gamma_t_eigen 5.0 ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf inf 0.1 gamma_t_eigen 5.0 7.0 ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng gaussian gamma_r_eigen inf inf 0.1 gamma_t_eigen 5.0 7.0 inf dipole 1.0 0.0 0.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_2d.lammpstrj id type # x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 5.102 | 5.102 | 5.102 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 0.99972222 0 0 0 0 0 + 100 33874.438 0 0.0003458147 0.00033165629 0 0.00067747099 + 200 34893.188 0 0.00066290209 0.00068146332 0 0.0013443654 + 300 34494.226 0 0.0010064902 0.0010276646 0 0.0020341548 + 400 34537.887 0 0.0013457339 0.0014057042 0 0.0027514381 + 500 34458.46 0 0.0017006949 0.0017120083 0 0.0034127033 + 600 33229.977 0 0.0020841613 0.0020485346 0 0.0041326959 + 700 33288.631 0 0.0024270272 0.0023673304 0 0.0047943576 + 800 35317.512 0 0.0027924435 0.0026950912 0 0.0054875347 + 900 33094.299 0 0.0031503627 0.0030789319 0 0.0062292946 + 1000 35801.751 0 0.003489398 0.0034594626 0 0.0069488607 + 1100 33427.701 0 0.0038547506 0.0038375809 0 0.0076923316 + 1200 34675.07 0 0.0041824195 0.0042017298 0 0.0083841493 + 1300 33080.294 0 0.0045258945 0.0045816356 0 0.0091075301 + 1400 34927.288 0 0.0048252992 0.0049215701 0 0.0097468693 + 1500 34338.558 0 0.0051959155 0.0053020102 0 0.010497926 + 1600 34686.248 0 0.0055111463 0.0056220225 0 0.011133169 + 1700 34336.158 0 0.0059240394 0.0059060319 0 0.011830071 + 1800 34315.859 0 0.0063027944 0.0063004467 0 0.012603241 + 1900 35096.721 0 0.0066098525 0.00672222 0 0.013332073 + 2000 33544.18 0 0.0069401261 0.007074124 0 0.01401425 + 2100 33863.219 0 0.0072726502 0.0074175954 0 0.014690246 + 2200 34705.892 0 0.0075586722 0.0077552683 0 0.015313941 + 2300 34025.357 0 0.0079046728 0.0081760519 0 0.016080725 + 2400 34741.849 0 0.008252969 0.0085203087 0 0.016773278 + 2500 34406.959 0 0.0085370091 0.0088556377 0 0.017392647 + 2600 34062.63 0 0.0088134153 0.0092536326 0 0.018067048 + 2700 34677.666 0 0.0090592854 0.0096225881 0 0.018681874 + 2800 33464.216 0 0.0093984162 0.0099647695 0 0.019363186 + 2900 32920.721 0 0.0098222985 0.010366517 0 0.020188816 + 3000 34539.66 0 0.010133317 0.01068102 0 0.020814337 +Loop time of 1.12143 on 4 procs for 3000 steps with 3600 atoms + +Performance: 2311.341 tau/day, 2675.163 timesteps/s +96.1% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.036017 | 0.042828 | 0.051558 | 2.7 | 3.82 +Output | 0.0012608 | 0.0025993 | 0.0063775 | 4.3 | 0.23 +Modify | 0.9002 | 0.93095 | 0.99546 | 3.9 | 83.01 +Other | | 0.1451 | | | 12.93 + +Nlocal: 900.000 ave 900 max 900 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Nghost: 61.0000 ave 61 max 61 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 0 +Dangerous builds = 0 +Total wall time: 0:00:01 diff --git a/examples/USER/brownian/asphere/log.11May2021.in3d.ellipsoid.g++.1 b/examples/USER/brownian/asphere/log.11May2021.in3d.ellipsoid.g++.1 new file mode 100644 index 0000000000..a193fc98e2 --- /dev/null +++ b/examples/USER/brownian/asphere/log.11May2021.in3d.ellipsoid.g++.1 @@ -0,0 +1,145 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### overdamped dynamics of non-interacting ellipsoids in 3D ##### + +variable rng string uniform +variable seed string 198098 +variable temp string 1.0 +variable gamma_r_1 string 2.0 +variable gamma_r_2 string 0.25 +variable gamma_r_3 string 0.1 +variable gamma_t_1 string 5.0 +variable gamma_t_2 string 7.0 +variable gamma_t_3 string 9.0 +variable params string ${rng}_${temp}_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_${temp}_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_2.0_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_2.0_0.25_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_2.0_0.25_0.1_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_2.0_0.25_0.1_5.0_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_2.0_0.25_0.1_5.0_7.0_${gamma_t_3} +variable params string uniform_1.0_2.0_0.25_0.1_5.0_7.0_9.0 + +units lj +atom_style hybrid dipole ellipsoid +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 3 +newton off + +lattice sc 0.4 +Lattice spacing in x,y,z = 1.3572088 1.3572088 1.3572088 +region box block -8 8 -8 8 -8 8 +create_box 1 box +Created orthogonal box = (-10.857670 -10.857670 -10.857670) to (10.857670 10.857670 10.857670) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 4096 atoms + create_atoms CPU = 0.005 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 198098 1.0 +Setting atom values ... + 4096 settings made for dipole/random +set type * shape 3.0 1.0 1.0 +Setting atom values ... + 4096 settings made for shape +set type * quat/random ${seed} +set type * quat/random 198098 +Setting atom values ... + 4096 settings made for quat/random +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/asphere ${temp} ${seed} rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 ${seed} rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 0.25 ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 0.25 0.1 gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 0.25 0.1 gamma_t_eigen 5.0 ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 0.25 0.1 gamma_t_eigen 5.0 7.0 ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 0.25 0.1 gamma_t_eigen 5.0 7.0 9.0 dipole 1.0 0.0 0.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_3d.lammpstrj id type # x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 5.219 | 5.219 | 5.219 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 1.4996338 0 0 0 0 0 + 100 45690.838 0 0.00029994317 0.00029953902 0.00030002809 0.00089951027 + 200 45571.166 0 0.00061376797 0.00060955238 0.00061153551 0.0018348559 + 300 44693.418 0 0.00093058034 0.00089383536 0.00091554588 0.0027399616 + 400 44831.846 0 0.001250227 0.0012230128 0.0012120517 0.0036852914 + 500 45028.015 0 0.0015448869 0.0015339549 0.0014978843 0.0045767262 + 600 45895.442 0 0.0018621952 0.0018169905 0.0018352784 0.0055144641 + 700 45858.744 0 0.0021617097 0.0021137714 0.0021360394 0.0064115206 + 800 45155.215 0 0.002428445 0.0024288837 0.0024516737 0.0073090023 + 900 45427.427 0 0.0027265978 0.0027662531 0.0027329878 0.0082258387 + 1000 45398.166 0 0.0030685345 0.0030805014 0.0029765916 0.0091256275 + 1100 44622.428 0 0.0033766954 0.0033976168 0.0032745406 0.010048853 + 1200 45500.277 0 0.0036410565 0.0036840528 0.0035831659 0.010908275 + 1300 45265.8 0 0.0039143146 0.0039419334 0.0038761633 0.011732411 + 1400 45482.435 0 0.0042006542 0.0043373651 0.004164002 0.012702021 + 1500 45126.629 0 0.0044647379 0.0046021855 0.004487041 0.013553965 + 1600 45178.172 0 0.0047726618 0.0049110287 0.0048012671 0.014484958 + 1700 44918.685 0 0.005104787 0.0052522662 0.0050844375 0.015441491 + 1800 44776.678 0 0.0054395368 0.0056092038 0.0054623875 0.016511128 + 1900 46035.987 0 0.0057735872 0.0059357043 0.0057296009 0.017438892 + 2000 45436.517 0 0.0060837459 0.0063485717 0.0059769119 0.018409229 + 2100 45871.502 0 0.0063736337 0.0066551978 0.0063077439 0.019336575 + 2200 45511.847 0 0.0066419141 0.0069700452 0.0065553318 0.020167291 + 2300 45597.047 0 0.0069251517 0.0073015716 0.0068945654 0.021121289 + 2400 44832.007 0 0.0071894253 0.0076238221 0.0071638554 0.021977103 + 2500 45668.42 0 0.0074351304 0.0079594991 0.0075390719 0.022933701 + 2600 45248.483 0 0.007781496 0.008293944 0.0077956068 0.023871047 + 2700 45308.515 0 0.0080302993 0.0086329679 0.0081457335 0.024809001 + 2800 45637.72 0 0.0083889026 0.0089173198 0.0086032427 0.025909465 + 2900 45909.343 0 0.0087169392 0.009181179 0.0088778569 0.026775975 + 3000 45213.613 0 0.0090508891 0.0094253485 0.0092660321 0.02774227 +Loop time of 4.13937 on 1 procs for 3000 steps with 4096 atoms + +Performance: 626.183 tau/day, 724.749 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.24709 | 0.24709 | 0.24709 | 0.0 | 5.97 +Output | 0.004636 | 0.004636 | 0.004636 | 0.0 | 0.11 +Modify | 3.7604 | 3.7604 | 3.7604 | 0.0 | 90.85 +Other | | 0.1272 | | | 3.07 + +Nlocal: 4096.00 ave 4096 max 4096 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 817.000 ave 817 max 817 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 0 +Dangerous builds = 0 +Total wall time: 0:00:04 diff --git a/examples/USER/brownian/asphere/log.11May2021.in3d.ellipsoid.g++.4 b/examples/USER/brownian/asphere/log.11May2021.in3d.ellipsoid.g++.4 new file mode 100644 index 0000000000..7f6d8a81f0 --- /dev/null +++ b/examples/USER/brownian/asphere/log.11May2021.in3d.ellipsoid.g++.4 @@ -0,0 +1,145 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### overdamped dynamics of non-interacting ellipsoids in 3D ##### + +variable rng string uniform +variable seed string 198098 +variable temp string 1.0 +variable gamma_r_1 string 2.0 +variable gamma_r_2 string 0.25 +variable gamma_r_3 string 0.1 +variable gamma_t_1 string 5.0 +variable gamma_t_2 string 7.0 +variable gamma_t_3 string 9.0 +variable params string ${rng}_${temp}_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_${temp}_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_${gamma_r_1}_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_2.0_${gamma_r_2}_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_2.0_0.25_${gamma_r_3}_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_2.0_0.25_0.1_${gamma_t_1}_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_2.0_0.25_0.1_5.0_${gamma_t_2}_${gamma_t_3} +variable params string uniform_1.0_2.0_0.25_0.1_5.0_7.0_${gamma_t_3} +variable params string uniform_1.0_2.0_0.25_0.1_5.0_7.0_9.0 + +units lj +atom_style hybrid dipole ellipsoid +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 3 +newton off + +lattice sc 0.4 +Lattice spacing in x,y,z = 1.3572088 1.3572088 1.3572088 +region box block -8 8 -8 8 -8 8 +create_box 1 box +Created orthogonal box = (-10.857670 -10.857670 -10.857670) to (10.857670 10.857670 10.857670) + 2 by 1 by 2 MPI processor grid +create_atoms 1 box +Created 4096 atoms + create_atoms CPU = 0.002 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 198098 1.0 +Setting atom values ... + 4096 settings made for dipole/random +set type * shape 3.0 1.0 1.0 +Setting atom values ... + 4096 settings made for shape +set type * quat/random ${seed} +set type * quat/random 198098 +Setting atom values ... + 4096 settings made for quat/random +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/asphere ${temp} ${seed} rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 ${seed} rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng ${rng} gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen ${gamma_r_1} ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 ${gamma_r_2} ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 0.25 ${gamma_r_3} gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 0.25 0.1 gamma_t_eigen ${gamma_t_1} ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 0.25 0.1 gamma_t_eigen 5.0 ${gamma_t_2} ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 0.25 0.1 gamma_t_eigen 5.0 7.0 ${gamma_t_3} dipole 1.0 0.0 0.0 +fix 1 all brownian/asphere 1.0 198098 rng uniform gamma_r_eigen 2.0 0.25 0.1 gamma_t_eigen 5.0 7.0 9.0 dipole 1.0 0.0 0.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_3d.lammpstrj id type # x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 5.152 | 5.152 | 5.152 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 1.4996338 0 0 0 0 0 + 100 45236.508 0 0.00030817418 0.00030717742 0.0003019227 0.0009172743 + 200 45564.566 0 0.00062027526 0.00062110132 0.0006080391 0.0018494157 + 300 46232.801 0 0.00091155216 0.00094473459 0.00093009391 0.0027863807 + 400 45250.414 0 0.0011980791 0.0012538262 0.0012201461 0.0036720513 + 500 45217.133 0 0.0015186813 0.0015752994 0.001509437 0.0046034177 + 600 45531.276 0 0.0018194588 0.0019243758 0.0018209246 0.0055647592 + 700 44834.624 0 0.0021277747 0.0022417115 0.0021352036 0.0065046898 + 800 45413.998 0 0.0024558838 0.0025741787 0.0024088704 0.0074389329 + 900 45668.624 0 0.0027366171 0.002858242 0.0027580782 0.0083529374 + 1000 45809.223 0 0.0030331425 0.003186293 0.0030414906 0.0092609261 + 1100 45193.019 0 0.0033199824 0.0034668659 0.003298885 0.010085733 + 1200 44522.927 0 0.0036503132 0.0037490684 0.0036089852 0.011008367 + 1300 45214.567 0 0.0039958617 0.0040881934 0.0038709079 0.011954963 + 1400 45217.997 0 0.004276499 0.0044624985 0.0041104891 0.012849487 + 1500 45497.171 0 0.0045943272 0.0047116875 0.0044113504 0.013717365 + 1600 45905.187 0 0.0049004996 0.0049982014 0.0047394999 0.014638201 + 1700 45551.346 0 0.0051540939 0.0053187249 0.0050861052 0.015558924 + 1800 45347.782 0 0.0054101891 0.0056306 0.0053515873 0.016392376 + 1900 45107.895 0 0.005743705 0.0059584896 0.0056220384 0.017324233 + 2000 45043.389 0 0.0059803588 0.006230449 0.005911555 0.018122363 + 2100 45433.293 0 0.0062610364 0.0066140744 0.0062152977 0.019090408 + 2200 45804.217 0 0.0064995183 0.0068831274 0.0064971789 0.019879825 + 2300 45697.516 0 0.0067910846 0.0071845673 0.0068046192 0.020780271 + 2400 45447.422 0 0.0071022706 0.0074743709 0.0070983185 0.02167496 + 2500 45395.18 0 0.0073817023 0.0077467991 0.0074263196 0.022554821 + 2600 45943.044 0 0.0075953233 0.007997707 0.0076508583 0.023243889 + 2700 45859.978 0 0.0079082128 0.0082090043 0.0078853376 0.024002555 + 2800 45822.007 0 0.0082607534 0.0084510061 0.0081985549 0.024910314 + 2900 45438.456 0 0.0085958203 0.0088807705 0.0084755353 0.025952126 + 3000 45060.957 0 0.0089017992 0.0090966159 0.0086718875 0.026670303 +Loop time of 1.23282 on 4 procs for 3000 steps with 4096 atoms + +Performance: 2102.502 tau/day, 2433.452 timesteps/s +97.9% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 8.5831e-06 | 8.5831e-06 | 8.5831e-06 | 0.0 | 0.00 +Comm | 0.10931 | 0.11473 | 0.11748 | 0.9 | 9.31 +Output | 0.001375 | 0.0018924 | 0.0034099 | 2.0 | 0.15 +Modify | 0.97744 | 0.99158 | 1.0089 | 1.3 | 80.43 +Other | | 0.1246 | | | 10.11 + +Nlocal: 1024.00 ave 1035 max 1016 min +Histogram: 2 0 0 0 0 0 1 0 0 1 +Nghost: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 1 +Dangerous builds = 0 +Total wall time: 0:00:01 diff --git a/examples/USER/brownian/point/in2d.point b/examples/USER/brownian/point/in2d.point new file mode 100644 index 0000000000..bf790f8fc2 --- /dev/null +++ b/examples/USER/brownian/point/in2d.point @@ -0,0 +1,43 @@ +##### dynamics of non-interacting point particles in 2D ##### + +variable rng string gaussian +variable seed string 198098 +variable temp string 5.0 +variable gamma_t string 1.0 +variable params string ${rng}_${temp}_${gamma_t} + +units lj +atom_style atomic +dimension 2 +newton off + +lattice sq 0.4 +region box block -30 30 -30 30 -0.2 0.2 +create_box 1 box +create_atoms 1 box +mass * 1.0 +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian ${temp} ${seed} rng ${rng} gamma_t ${gamma_t} + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_2d.lammpstrj id type & +# x y z xu yu zu fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 diff --git a/examples/USER/brownian/point/in3d.point b/examples/USER/brownian/point/in3d.point new file mode 100644 index 0000000000..974f08866d --- /dev/null +++ b/examples/USER/brownian/point/in3d.point @@ -0,0 +1,44 @@ +##### overdamped dynamics of non-interacting point particles in 3D ##### + +variable rng string gaussian +variable seed string 198098 +variable temp string 5.0 +variable gamma_t string 1.0 +variable params string ${rng}_${temp}_${gamma_t} + +units lj +atom_style atomic +dimension 3 +newton off + +lattice sc 0.4 +region box block -8 8 -8 8 -8 8 +create_box 1 box +create_atoms 1 box +mass * 1.0 +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian ${temp} ${seed} rng ${rng} gamma_t ${gamma_t} + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_3d.lammpstrj id type & +# x y z xu yu zu fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 + diff --git a/examples/USER/brownian/point/log.11May2021.in2d.point.g++.1 b/examples/USER/brownian/point/log.11May2021.in2d.point.g++.1 new file mode 100644 index 0000000000..b875ccedad --- /dev/null +++ b/examples/USER/brownian/point/log.11May2021.in2d.point.g++.1 @@ -0,0 +1,119 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### dynamics of non-interacting point particles in 2D ##### + +variable rng string gaussian +variable seed string 198098 +variable temp string 5.0 +variable gamma_t string 1.0 +variable params string ${rng}_${temp}_${gamma_t} +variable params string gaussian_${temp}_${gamma_t} +variable params string gaussian_5.0_${gamma_t} +variable params string gaussian_5.0_1.0 + +units lj +atom_style atomic +dimension 2 +newton off + +lattice sq 0.4 +Lattice spacing in x,y,z = 1.5811388 1.5811388 1.5811388 +region box block -30 30 -30 30 -0.2 0.2 +create_box 1 box +Created orthogonal box = (-47.434165 -47.434165 -0.31622777) to (47.434165 47.434165 0.31622777) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 3600 atoms + create_atoms CPU = 0.003 seconds +mass * 1.0 +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian ${temp} ${seed} rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 ${seed} rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng gaussian gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng gaussian gamma_t 1.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_2d.lammpstrj id type # x y z xu yu zu fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 2.664 | 2.664 | 2.664 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 0.99972222 0 0 0 0 0 + 100 1022861.2 0 0.010252464 0.0095481044 0 0.019800568 + 200 986781.19 0 0.020552091 0.019485252 0 0.040037343 + 300 1030219 0 0.030642552 0.028377678 0 0.05902023 + 400 1003322.5 0 0.040610693 0.038179284 0 0.078789978 + 500 989343.12 0 0.049978908 0.047445856 0 0.097424764 + 600 1029781.3 0 0.059551719 0.057941149 0 0.11749287 + 700 999447.72 0 0.06979546 0.067552325 0 0.13734778 + 800 995373.97 0 0.080049251 0.078006344 0 0.1580556 + 900 1011991.4 0 0.089753134 0.087065214 0 0.17681835 + 1000 1006017.1 0 0.10041092 0.097934217 0 0.19834514 + 1100 997762.63 0 0.11229742 0.10841547 0 0.22071289 + 1200 1011707.8 0 0.12006388 0.1190115 0 0.23907538 + 1300 1012099.1 0 0.13097486 0.12996632 0 0.26094117 + 1400 997602.43 0 0.14345778 0.13830585 0 0.28176362 + 1500 1005358.1 0 0.15441686 0.14927539 0 0.30369225 + 1600 1007081.8 0 0.16496828 0.15936363 0 0.3243319 + 1700 990284.9 0 0.1747286 0.16818246 0 0.34291106 + 1800 969006.97 0 0.18228778 0.17972813 0 0.3620159 + 1900 998066.69 0 0.19338277 0.19226121 0 0.38564397 + 2000 972300.66 0 0.20352485 0.20145928 0 0.40498413 + 2100 985025.88 0 0.21283854 0.21090075 0 0.42373929 + 2200 1010964.6 0 0.22279055 0.22110734 0 0.44389789 + 2300 975819.44 0 0.23128131 0.23226488 0 0.46354619 + 2400 977043.53 0 0.24284105 0.24301689 0 0.48585794 + 2500 969708.21 0 0.25415238 0.25354284 0 0.50769522 + 2600 981969.5 0 0.26457173 0.26318018 0 0.52775192 + 2700 987261.1 0 0.27497004 0.27761213 0 0.55258218 + 2800 1005751.1 0 0.28530448 0.28715428 0 0.57245876 + 2900 975930.11 0 0.29394811 0.29896948 0 0.59291759 + 3000 997388.08 0 0.30674701 0.31193573 0 0.61868274 +Loop time of 1.501 on 1 procs for 3000 steps with 3600 atoms + +Performance: 1726.852 tau/day, 1998.672 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0.00059271 | 0.00059271 | 0.00059271 | 0.0 | 0.04 +Comm | 0.0055437 | 0.0055437 | 0.0055437 | 0.0 | 0.37 +Output | 0.0039999 | 0.0039999 | 0.0039999 | 0.0 | 0.27 +Modify | 1.3852 | 1.3852 | 1.3852 | 0.0 | 92.28 +Other | | 0.1057 | | | 7.04 + +Nlocal: 3600.00 ave 3600 max 3600 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 21 +Dangerous builds = 0 + +Total wall time: 0:00:01 diff --git a/examples/USER/brownian/point/log.11May2021.in2d.point.g++.4 b/examples/USER/brownian/point/log.11May2021.in2d.point.g++.4 new file mode 100644 index 0000000000..d12d71118e --- /dev/null +++ b/examples/USER/brownian/point/log.11May2021.in2d.point.g++.4 @@ -0,0 +1,119 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### dynamics of non-interacting point particles in 2D ##### + +variable rng string gaussian +variable seed string 198098 +variable temp string 5.0 +variable gamma_t string 1.0 +variable params string ${rng}_${temp}_${gamma_t} +variable params string gaussian_${temp}_${gamma_t} +variable params string gaussian_5.0_${gamma_t} +variable params string gaussian_5.0_1.0 + +units lj +atom_style atomic +dimension 2 +newton off + +lattice sq 0.4 +Lattice spacing in x,y,z = 1.5811388 1.5811388 1.5811388 +region box block -30 30 -30 30 -0.2 0.2 +create_box 1 box +Created orthogonal box = (-47.434165 -47.434165 -0.31622777) to (47.434165 47.434165 0.31622777) + 2 by 2 by 1 MPI processor grid +create_atoms 1 box +Created 3600 atoms + create_atoms CPU = 0.001 seconds +mass * 1.0 +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian ${temp} ${seed} rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 ${seed} rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng gaussian gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng gaussian gamma_t 1.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_2d.lammpstrj id type # x y z xu yu zu fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 2.664 | 2.664 | 2.664 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 0.99972222 0 0 0 0 0 + 100 1017972.1 0 0.010094052 0.0097502899 0 0.019844342 + 200 1004552.1 0 0.020125116 0.01957629 0 0.039701406 + 300 1017712.9 0 0.030271373 0.029411656 0 0.059683029 + 400 1016693.8 0 0.040610061 0.038605869 0 0.07921593 + 500 999527.84 0 0.049451389 0.049042225 0 0.098493614 + 600 961157.92 0 0.059691948 0.059033176 0 0.11872512 + 700 1006804.9 0 0.071205977 0.069972106 0 0.14117808 + 800 1007321.8 0 0.081136977 0.079825976 0 0.16096295 + 900 1002801.7 0 0.091236148 0.090833816 0 0.18206996 + 1000 1010134.7 0 0.10091362 0.10023906 0 0.20115269 + 1100 990246.55 0 0.1118367 0.11141049 0 0.22324719 + 1200 1010555.5 0 0.12091736 0.12355456 0 0.24447192 + 1300 997117.19 0 0.13099592 0.13292775 0 0.26392367 + 1400 1020817.1 0 0.14167961 0.14172898 0 0.28340859 + 1500 1015048.1 0 0.15225884 0.15162948 0 0.30388833 + 1600 990291.98 0 0.16460973 0.16251919 0 0.32712891 + 1700 980848.58 0 0.17380313 0.17351201 0 0.34731513 + 1800 1000673.8 0 0.18383991 0.18175453 0 0.36559445 + 1900 1009388.9 0 0.19411523 0.19367453 0 0.38778976 + 2000 1005935.9 0 0.2015342 0.20585359 0 0.40738779 + 2100 985500.56 0 0.21161056 0.21238463 0 0.42399519 + 2200 997241.34 0 0.21841986 0.22117922 0 0.43959908 + 2300 1011672.3 0 0.22688099 0.23155741 0 0.4584384 + 2400 989837.68 0 0.23849839 0.24219 0 0.48068839 + 2500 1035706.8 0 0.24541408 0.24947563 0 0.49488971 + 2600 992370.08 0 0.25537803 0.25758332 0 0.51296135 + 2700 990586.56 0 0.26542605 0.26762286 0 0.53304892 + 2800 1002767.3 0 0.27570392 0.27874972 0 0.55445363 + 2900 995307.27 0 0.28580946 0.29115624 0 0.5769657 + 3000 1024317.7 0 0.29493208 0.30208924 0 0.59702132 +Loop time of 0.413047 on 4 procs for 3000 steps with 3600 atoms + +Performance: 6275.312 tau/day, 7263.093 timesteps/s +98.0% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0.00016236 | 0.00016338 | 0.00016403 | 0.0 | 0.04 +Comm | 0.0026367 | 0.0030084 | 0.0031497 | 0.4 | 0.73 +Output | 0.0011849 | 0.0013574 | 0.0018065 | 0.7 | 0.33 +Modify | 0.34447 | 0.35223 | 0.36357 | 1.2 | 85.28 +Other | | 0.05629 | | | 13.63 + +Nlocal: 900.000 ave 906 max 891 min +Histogram: 1 0 0 0 0 0 1 0 1 1 +Nghost: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 22 +Dangerous builds = 0 + +Total wall time: 0:00:00 diff --git a/examples/USER/brownian/point/log.11May2021.in3d.point.g++.1 b/examples/USER/brownian/point/log.11May2021.in3d.point.g++.1 new file mode 100644 index 0000000000..febb7657e7 --- /dev/null +++ b/examples/USER/brownian/point/log.11May2021.in3d.point.g++.1 @@ -0,0 +1,119 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### overdamped dynamics of non-interacting point particles in 3D ##### + +variable rng string gaussian +variable seed string 198098 +variable temp string 5.0 +variable gamma_t string 1.0 +variable params string ${rng}_${temp}_${gamma_t} +variable params string gaussian_${temp}_${gamma_t} +variable params string gaussian_5.0_${gamma_t} +variable params string gaussian_5.0_1.0 + +units lj +atom_style atomic +dimension 3 +newton off + +lattice sc 0.4 +Lattice spacing in x,y,z = 1.3572088 1.3572088 1.3572088 +region box block -8 8 -8 8 -8 8 +create_box 1 box +Created orthogonal box = (-10.857670 -10.857670 -10.857670) to (10.857670 10.857670 10.857670) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 4096 atoms + create_atoms CPU = 0.002 seconds +mass * 1.0 +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian ${temp} ${seed} rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 ${seed} rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng gaussian gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng gaussian gamma_t 1.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_3d.lammpstrj id type # x y z xu yu zu fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 2.694 | 2.694 | 2.694 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 1.4996338 0 0 0 0 0 + 100 1500286.3 0 0.0098123603 0.010352169 0.010242435 0.030406964 + 200 1488308 0 0.019934427 0.019968198 0.020471735 0.06037436 + 300 1484472.4 0 0.029397156 0.030749312 0.030121294 0.090267762 + 400 1517938.7 0 0.039217504 0.041440617 0.040512943 0.12117106 + 500 1492769.5 0 0.04890343 0.051561801 0.050614941 0.15108017 + 600 1510159.6 0 0.059770181 0.061650364 0.061298117 0.18271866 + 700 1485424.1 0 0.070537955 0.071144877 0.071141546 0.21282438 + 800 1496377.2 0 0.081291995 0.082546059 0.080653381 0.24449144 + 900 1484409.1 0 0.090940427 0.093298981 0.091328056 0.27556746 + 1000 1503322.4 0 0.10176921 0.10246052 0.10151773 0.30574747 + 1100 1503322.4 0 0.11295993 0.11052632 0.11053406 0.33402031 + 1200 1489236.2 0 0.12509723 0.11961982 0.12146498 0.36618203 + 1300 1476050.3 0 0.13449034 0.12941323 0.1309765 0.39488007 + 1400 1520818.7 0 0.14613571 0.13788044 0.14083944 0.42485558 + 1500 1498936.4 0 0.15752286 0.15057712 0.15063399 0.45873397 + 1600 1507524.1 0 0.16793678 0.16095681 0.16063531 0.4895289 + 1700 1480581.2 0 0.17748019 0.172614 0.16922383 0.51931802 + 1800 1505353.6 0 0.18850931 0.18304171 0.18063119 0.55218221 + 1900 1491234.7 0 0.19836402 0.19306339 0.1929707 0.58439811 + 2000 1519868.8 0 0.20698191 0.20211344 0.20328302 0.61237838 + 2100 1493919.5 0 0.21453524 0.21186097 0.21423293 0.64062914 + 2200 1517098.6 0 0.2257338 0.22381647 0.22474081 0.67429108 + 2300 1481270.7 0 0.23499747 0.23348379 0.23498244 0.70346369 + 2400 1495445.1 0 0.24535894 0.24290239 0.24229161 0.73055293 + 2500 1522839.3 0 0.25695938 0.25109669 0.25214541 0.76020148 + 2600 1518697.4 0 0.26680819 0.26120216 0.2604112 0.78842155 + 2700 1529283.1 0 0.27524422 0.26942681 0.27148042 0.81615146 + 2800 1500557.5 0 0.28436226 0.27957592 0.27935619 0.84329437 + 2900 1509711.1 0 0.2948528 0.28562401 0.29055956 0.87103637 + 3000 1522712.8 0 0.30347033 0.2975063 0.30121685 0.90219348 +Loop time of 2.35056 on 1 procs for 3000 steps with 4096 atoms + +Performance: 1102.718 tau/day, 1276.293 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0.00077772 | 0.00077772 | 0.00077772 | 0.0 | 0.03 +Comm | 0.010985 | 0.010985 | 0.010985 | 0.0 | 0.47 +Output | 0.0045807 | 0.0045807 | 0.0045807 | 0.0 | 0.19 +Modify | 2.2116 | 2.2116 | 2.2116 | 0.0 | 94.09 +Other | | 0.1226 | | | 5.22 + +Nlocal: 4096.00 ave 4096 max 4096 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 24 +Dangerous builds = 0 + +Total wall time: 0:00:02 diff --git a/examples/USER/brownian/point/log.11May2021.in3d.point.g++.4 b/examples/USER/brownian/point/log.11May2021.in3d.point.g++.4 new file mode 100644 index 0000000000..6a41a12f0f --- /dev/null +++ b/examples/USER/brownian/point/log.11May2021.in3d.point.g++.4 @@ -0,0 +1,119 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### overdamped dynamics of non-interacting point particles in 3D ##### + +variable rng string gaussian +variable seed string 198098 +variable temp string 5.0 +variable gamma_t string 1.0 +variable params string ${rng}_${temp}_${gamma_t} +variable params string gaussian_${temp}_${gamma_t} +variable params string gaussian_5.0_${gamma_t} +variable params string gaussian_5.0_1.0 + +units lj +atom_style atomic +dimension 3 +newton off + +lattice sc 0.4 +Lattice spacing in x,y,z = 1.3572088 1.3572088 1.3572088 +region box block -8 8 -8 8 -8 8 +create_box 1 box +Created orthogonal box = (-10.857670 -10.857670 -10.857670) to (10.857670 10.857670 10.857670) + 2 by 1 by 2 MPI processor grid +create_atoms 1 box +Created 4096 atoms + create_atoms CPU = 0.001 seconds +mass * 1.0 +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian ${temp} ${seed} rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 ${seed} rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng ${rng} gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng gaussian gamma_t ${gamma_t} +fix 1 all brownian 5.0 198098 rng gaussian gamma_t 1.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_3d.lammpstrj id type # x y z xu yu zu fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 2.672 | 2.672 | 2.672 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 1.4996338 0 0 0 0 0 + 100 1515328.2 0 0.010465453 0.010044629 0.0097242319 0.030234314 + 200 1510820.8 0 0.020658886 0.019954762 0.020008864 0.060622512 + 300 1482006.5 0 0.030402195 0.029802874 0.030047586 0.090252655 + 400 1492228.5 0 0.039622543 0.038899144 0.040381854 0.11890354 + 500 1494985.5 0 0.050523465 0.050022913 0.050186478 0.15073286 + 600 1516047.4 0 0.061111845 0.061433818 0.059195364 0.18174103 + 700 1510021.8 0 0.071636778 0.072829755 0.06946406 0.21393059 + 800 1505964.7 0 0.08240965 0.08433785 0.078799851 0.24554735 + 900 1491035.9 0 0.093659937 0.094517749 0.08812559 0.27630328 + 1000 1516599.6 0 0.10436496 0.10431759 0.097480868 0.30616342 + 1100 1495170.3 0 0.11468757 0.111397 0.1069763 0.33306087 + 1200 1500630.6 0 0.12360977 0.12264534 0.11583999 0.3620951 + 1300 1474889.5 0 0.13432447 0.13471694 0.12702491 0.39606632 + 1400 1487145.8 0 0.14573239 0.14431493 0.13669403 0.42674135 + 1500 1519496.7 0 0.15610742 0.15505416 0.14600182 0.4571634 + 1600 1525674.1 0 0.16728653 0.1649354 0.15562133 0.48784325 + 1700 1540725.4 0 0.17846447 0.17666562 0.16531781 0.52044791 + 1800 1512334.8 0 0.18872753 0.18538847 0.17450009 0.54861609 + 1900 1498371.4 0 0.19688928 0.19333299 0.18581712 0.5760394 + 2000 1546459.4 0 0.20955053 0.20243854 0.19613897 0.60812803 + 2100 1509712.9 0 0.21922567 0.20940597 0.20567239 0.63430404 + 2200 1509630.4 0 0.23067999 0.21856734 0.21619911 0.66544645 + 2300 1483929.1 0 0.24160803 0.231048 0.22617193 0.69882797 + 2400 1488492.1 0 0.25399491 0.24082678 0.23972356 0.73454526 + 2500 1508107.9 0 0.26608734 0.25316913 0.2486814 0.76793787 + 2600 1511952.1 0 0.27523956 0.2623673 0.25706539 0.79467225 + 2700 1488888.8 0 0.28518299 0.27425585 0.26728622 0.82672506 + 2800 1515428.4 0 0.29595429 0.28589969 0.27781327 0.85966725 + 2900 1504312.1 0 0.30393798 0.29533034 0.28725362 0.88652194 + 3000 1521521.3 0 0.31445132 0.30117607 0.29959324 0.91522062 +Loop time of 0.708196 on 4 procs for 3000 steps with 4096 atoms + +Performance: 3660.004 tau/day, 4236.115 timesteps/s +97.2% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0.00020647 | 0.00021023 | 0.0002141 | 0.0 | 0.03 +Comm | 0.0045607 | 0.0050649 | 0.0053098 | 0.4 | 0.72 +Output | 0.0013759 | 0.002265 | 0.0037355 | 1.9 | 0.32 +Modify | 0.57353 | 0.58931 | 0.6109 | 1.8 | 83.21 +Other | | 0.1113 | | | 15.72 + +Nlocal: 1024.00 ave 1043 max 1001 min +Histogram: 1 0 0 0 0 2 0 0 0 1 +Nghost: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 25 +Dangerous builds = 0 + +Total wall time: 0:00:00 diff --git a/examples/USER/brownian/sphere/in2d.sphere b/examples/USER/brownian/sphere/in2d.sphere new file mode 100644 index 0000000000..a194f967cc --- /dev/null +++ b/examples/USER/brownian/sphere/in2d.sphere @@ -0,0 +1,45 @@ +##### overdamped dynamics of a sphere (with dipole attached to it) in 2D ##### + +variable rng string uniform +variable seed string 198098 +variable temp string 1.0 +variable gamma_t string 5.0 +variable gamma_r string 0.7 +variable params string ${rng}_${temp}_${gamma_r}_${gamma_t} + +units lj +atom_style hybrid dipole sphere +dimension 2 +newton off + +lattice sq 0.4 +region box block -30 30 -30 30 -0.2 0.2 +create_box 1 box +create_atoms 1 box +mass * 1.0 +set type * dipole/random ${seed} 1.0 +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/sphere ${temp} ${seed} rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_2d.lammpstrj id type & +# x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 diff --git a/examples/USER/brownian/sphere/in3d.sphere b/examples/USER/brownian/sphere/in3d.sphere new file mode 100644 index 0000000000..d3264ee450 --- /dev/null +++ b/examples/USER/brownian/sphere/in3d.sphere @@ -0,0 +1,45 @@ +##### overdamped dynamics of a sphere (with dipole attached to it) in 3D##### + +variable rng string uniform +variable seed string 198098 +variable temp string 1.0 +variable gamma_t string 5.0 +variable gamma_r string 0.7 +variable params string ${rng}_${temp}_${gamma_r}_${gamma_t} + +units lj +atom_style hybrid dipole sphere +dimension 3 +newton off + +lattice sc 0.4 +region box block -8 8 -8 8 -8 8 +create_box 1 box +create_atoms 1 box +mass * 1.0 +set type * dipole/random ${seed} 1.0 +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/sphere ${temp} ${seed} rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_3d.lammpstrj id type & +# x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 diff --git a/examples/USER/brownian/sphere/log.11May2021.in2d.sphere.g++.1 b/examples/USER/brownian/sphere/log.11May2021.in2d.sphere.g++.1 new file mode 100644 index 0000000000..917f2b63fb --- /dev/null +++ b/examples/USER/brownian/sphere/log.11May2021.in2d.sphere.g++.1 @@ -0,0 +1,126 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### overdamped dynamics of a sphere (with dipole attached to it) in 2D ##### + +variable rng string uniform +variable seed string 198098 +variable temp string 1.0 +variable gamma_t string 5.0 +variable gamma_r string 0.7 +variable params string ${rng}_${temp}_${gamma_r}_${gamma_t} +variable params string uniform_${temp}_${gamma_r}_${gamma_t} +variable params string uniform_1.0_${gamma_r}_${gamma_t} +variable params string uniform_1.0_0.7_${gamma_t} +variable params string uniform_1.0_0.7_5.0 + +units lj +atom_style hybrid dipole sphere +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 2 +newton off + +lattice sq 0.4 +Lattice spacing in x,y,z = 1.5811388 1.5811388 1.5811388 +region box block -30 30 -30 30 -0.2 0.2 +create_box 1 box +Created orthogonal box = (-47.434165 -47.434165 -0.31622777) to (47.434165 47.434165 0.31622777) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 3600 atoms + create_atoms CPU = 0.005 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 198098 1.0 +Setting atom values ... + 3600 settings made for dipole/random +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/sphere ${temp} ${seed} rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 ${seed} rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r 0.7 gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r 0.7 gamma_t 5.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_2d.lammpstrj id type # x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 4.664 | 4.664 | 4.664 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 0.99972222 0 0 0 0 0 + 100 20867.136 0 0.00040006704 0.00039570887 0 0.00079577592 + 200 20835.491 0 0.00077560901 0.00080455484 0 0.0015801638 + 300 20813.122 0 0.0011737739 0.0012172689 0 0.0023910428 + 400 21137.397 0 0.0015587675 0.0016096093 0 0.0031683768 + 500 21167.188 0 0.0019294105 0.0020251322 0 0.0039545428 + 600 21345.908 0 0.0023105313 0.0024111742 0 0.0047217054 + 700 21086.272 0 0.0027236116 0.0027846006 0 0.0055082122 + 800 20840.906 0 0.0031505299 0.0031810732 0 0.0063316031 + 900 20916.456 0 0.0035525852 0.0035981301 0 0.0071507153 + 1000 20752.249 0 0.0039147929 0.0039791172 0 0.0078939101 + 1100 20643.612 0 0.0042977921 0.0043701484 0 0.0086679405 + 1200 21085.63 0 0.0045584242 0.0047475091 0 0.0093059332 + 1300 20900.794 0 0.0049718803 0.0051481706 0 0.010120051 + 1400 20980.731 0 0.0054234603 0.0054230724 0 0.010846533 + 1500 20916.308 0 0.0058502946 0.0058114313 0 0.011661726 + 1600 20949.786 0 0.0062258463 0.006208129 0 0.012433975 + 1700 20531.205 0 0.0066276219 0.006595921 0 0.013223543 + 1800 21418.472 0 0.0070077409 0.007030461 0 0.014038202 + 1900 21291.928 0 0.0074052208 0.0074333041 0 0.014838525 + 2000 20893.895 0 0.0077407477 0.007901402 0 0.01564215 + 2100 21218.001 0 0.0080384756 0.0082611258 0 0.016299601 + 2200 21116.189 0 0.0084325164 0.008617977 0 0.017050493 + 2300 20718.83 0 0.0089455345 0.0091768161 0 0.018122351 + 2400 20719.164 0 0.0093666455 0.0095272546 0 0.0188939 + 2500 20991.382 0 0.009706795 0.0098256506 0 0.019532446 + 2600 20515.74 0 0.0099247069 0.010329841 0 0.020254548 + 2700 21001.55 0 0.010448354 0.010693502 0 0.021141855 + 2800 21363.824 0 0.010990971 0.011142092 0 0.022133063 + 2900 20497.025 0 0.011399704 0.011504868 0 0.022904573 + 3000 20726.572 0 0.011785354 0.01187482 0 0.023660175 +Loop time of 1.76023 on 1 procs for 3000 steps with 3600 atoms + +Performance: 1472.538 tau/day, 1704.326 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.026518 | 0.026518 | 0.026518 | 0.0 | 1.51 +Output | 0.0040107 | 0.0040107 | 0.0040107 | 0.0 | 0.23 +Modify | 1.6194 | 1.6194 | 1.6194 | 0.0 | 92.00 +Other | | 0.1103 | | | 6.27 + +Nlocal: 3600.00 ave 3600 max 3600 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 121.000 ave 121 max 121 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 0 +Dangerous builds = 0 +Total wall time: 0:00:01 diff --git a/examples/USER/brownian/sphere/log.11May2021.in2d.sphere.g++.4 b/examples/USER/brownian/sphere/log.11May2021.in2d.sphere.g++.4 new file mode 100644 index 0000000000..95602a809c --- /dev/null +++ b/examples/USER/brownian/sphere/log.11May2021.in2d.sphere.g++.4 @@ -0,0 +1,126 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### overdamped dynamics of a sphere (with dipole attached to it) in 2D ##### + +variable rng string uniform +variable seed string 198098 +variable temp string 1.0 +variable gamma_t string 5.0 +variable gamma_r string 0.7 +variable params string ${rng}_${temp}_${gamma_r}_${gamma_t} +variable params string uniform_${temp}_${gamma_r}_${gamma_t} +variable params string uniform_1.0_${gamma_r}_${gamma_t} +variable params string uniform_1.0_0.7_${gamma_t} +variable params string uniform_1.0_0.7_5.0 + +units lj +atom_style hybrid dipole sphere +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 2 +newton off + +lattice sq 0.4 +Lattice spacing in x,y,z = 1.5811388 1.5811388 1.5811388 +region box block -30 30 -30 30 -0.2 0.2 +create_box 1 box +Created orthogonal box = (-47.434165 -47.434165 -0.31622777) to (47.434165 47.434165 0.31622777) + 2 by 2 by 1 MPI processor grid +create_atoms 1 box +Created 3600 atoms + create_atoms CPU = 0.002 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 198098 1.0 +Setting atom values ... + 3600 settings made for dipole/random +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/sphere ${temp} ${seed} rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 ${seed} rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r 0.7 gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r 0.7 gamma_t 5.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_2d.lammpstrj id type # x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 4.664 | 4.664 | 4.664 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 0.99972222 0 0 0 0 0 + 100 21085.797 0 0.00042014118 0.00040399828 0 0.00082413946 + 200 20598.717 0 0.00081715618 0.00082613236 0 0.0016432885 + 300 21040.226 0 0.0012412527 0.0012520475 0 0.0024933002 + 400 21289.734 0 0.0016129899 0.001634482 0 0.003247472 + 500 20951.595 0 0.0020104279 0.0020197694 0 0.0040301973 + 600 20984.974 0 0.0023965593 0.0024277086 0 0.0048242679 + 700 21252.602 0 0.0028349303 0.0028407812 0 0.0056757114 + 800 20951.95 0 0.0032674595 0.0032573476 0 0.0065248071 + 900 20828.611 0 0.003647953 0.0036650963 0 0.0073130493 + 1000 21073.256 0 0.0040238604 0.0040103537 0 0.0080342142 + 1100 21104.396 0 0.0043694059 0.0044146515 0 0.0087840574 + 1200 20580.591 0 0.0047638237 0.0047646659 0 0.0095284896 + 1300 20667.623 0 0.0051512568 0.0051134445 0 0.010264701 + 1400 20466.72 0 0.0055921578 0.005517863 0 0.011110021 + 1500 20842.366 0 0.0059747304 0.0059374031 0 0.011912134 + 1600 20867.02 0 0.0065493697 0.0064163066 0 0.012965676 + 1700 21021.077 0 0.0070208005 0.0068164842 0 0.013837285 + 1800 21191.183 0 0.0073708939 0.0073226521 0 0.014693546 + 1900 20792.8 0 0.0076984189 0.0077400043 0 0.015438423 + 2000 21296.326 0 0.0081882545 0.0081503672 0 0.016338622 + 2100 21085.097 0 0.008596146 0.0086041272 0 0.017200273 + 2200 20506.523 0 0.0089905439 0.0091045462 0 0.01809509 + 2300 21068.555 0 0.0094163509 0.0094703314 0 0.018886682 + 2400 21128.867 0 0.0097349212 0.0098535832 0 0.019588504 + 2500 21009.514 0 0.010218059 0.010244621 0 0.020462679 + 2600 21177.52 0 0.01060437 0.010642719 0 0.021247089 + 2700 20832.56 0 0.011052623 0.011078899 0 0.022131522 + 2800 21298.053 0 0.011439708 0.011587702 0 0.023027411 + 2900 21002.28 0 0.011863624 0.01199682 0 0.023860443 + 3000 20737.939 0 0.012229608 0.012324579 0 0.024554186 +Loop time of 0.492798 on 4 procs for 3000 steps with 3600 atoms + +Performance: 5259.763 tau/day, 6087.689 timesteps/s +96.5% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.018005 | 0.021524 | 0.025207 | 2.2 | 4.37 +Output | 0.0013187 | 0.0018334 | 0.0032332 | 1.9 | 0.37 +Modify | 0.37545 | 0.38775 | 0.40664 | 1.9 | 78.68 +Other | | 0.08169 | | | 16.58 + +Nlocal: 900.000 ave 900 max 900 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Nghost: 61.0000 ave 61 max 61 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 0 +Dangerous builds = 0 +Total wall time: 0:00:00 diff --git a/examples/USER/brownian/sphere/log.11May2021.in3d.sphere.g++.1 b/examples/USER/brownian/sphere/log.11May2021.in3d.sphere.g++.1 new file mode 100644 index 0000000000..20048f96f1 --- /dev/null +++ b/examples/USER/brownian/sphere/log.11May2021.in3d.sphere.g++.1 @@ -0,0 +1,126 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### overdamped dynamics of a sphere (with dipole attached to it) in 3D##### + +variable rng string uniform +variable seed string 198098 +variable temp string 1.0 +variable gamma_t string 5.0 +variable gamma_r string 0.7 +variable params string ${rng}_${temp}_${gamma_r}_${gamma_t} +variable params string uniform_${temp}_${gamma_r}_${gamma_t} +variable params string uniform_1.0_${gamma_r}_${gamma_t} +variable params string uniform_1.0_0.7_${gamma_t} +variable params string uniform_1.0_0.7_5.0 + +units lj +atom_style hybrid dipole sphere +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 3 +newton off + +lattice sc 0.4 +Lattice spacing in x,y,z = 1.3572088 1.3572088 1.3572088 +region box block -8 8 -8 8 -8 8 +create_box 1 box +Created orthogonal box = (-10.857670 -10.857670 -10.857670) to (10.857670 10.857670 10.857670) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 4096 atoms + create_atoms CPU = 0.005 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 198098 1.0 +Setting atom values ... + 4096 settings made for dipole/random +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/sphere ${temp} ${seed} rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 ${seed} rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r 0.7 gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r 0.7 gamma_t 5.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_3d.lammpstrj id type # x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 4.737 | 4.737 | 4.737 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 1.4996338 0 0 0 0 0 + 100 32032.279 0 0.00040227568 0.00039159837 0.00040147813 0.0011953522 + 200 31673.093 0 0.00077657885 0.00077292327 0.00079963705 0.0023491392 + 300 31476.164 0 0.0011712083 0.0011606723 0.0012089537 0.0035408343 + 400 31911.374 0 0.0015803424 0.001562091 0.0016042321 0.0047466655 + 500 31182.011 0 0.0019677217 0.0019269105 0.0020015977 0.0058962298 + 600 31206.05 0 0.0023360975 0.0023134398 0.0024213372 0.0070708745 + 700 31278.057 0 0.0026966955 0.0027129858 0.0028721373 0.0082818187 + 800 31677.724 0 0.0031197964 0.003134834 0.0032726303 0.0095272607 + 900 31312.741 0 0.0035636612 0.0035573653 0.0037328373 0.010853864 + 1000 31426.075 0 0.0039774626 0.003952159 0.0041879386 0.01211756 + 1100 31361.699 0 0.0044256852 0.004320566 0.004638132 0.013384383 + 1200 31559.778 0 0.0048338539 0.0047210601 0.0050296056 0.01458452 + 1300 31716.797 0 0.0052239651 0.0050796723 0.0054794684 0.015783106 + 1400 31231.077 0 0.0055890568 0.005472377 0.0059264123 0.016987846 + 1500 31605.513 0 0.0059876582 0.0058974054 0.0063452478 0.018230311 + 1600 31551.402 0 0.006413094 0.0062665632 0.0067442106 0.019423868 + 1700 31725.868 0 0.0068244611 0.0067189707 0.0071424779 0.02068591 + 1800 31385.794 0 0.0071570297 0.0070502303 0.0075240296 0.02173129 + 1900 31754.094 0 0.0075638662 0.0074243015 0.0079935325 0.0229817 + 2000 31668.959 0 0.0080059944 0.0079019753 0.0084000614 0.024308031 + 2100 31781.994 0 0.0084108141 0.0082719077 0.0088004977 0.02548322 + 2200 31455.021 0 0.0088844434 0.0086931769 0.0091916929 0.026769313 + 2300 31273.079 0 0.0093155639 0.0091027782 0.0095364621 0.027954804 + 2400 31283.781 0 0.0098441686 0.0094496218 0.0099279073 0.029221698 + 2500 31758.315 0 0.010372129 0.0097843406 0.010334653 0.030491123 + 2600 31780.442 0 0.010770862 0.010313119 0.010637545 0.031721525 + 2700 31552.277 0 0.011268703 0.010693437 0.01110762 0.033069759 + 2800 31124.693 0 0.011661333 0.011100115 0.011480624 0.034242072 + 2900 31438.795 0 0.012068847 0.011346633 0.011842006 0.035257486 + 3000 31574.258 0 0.012482632 0.011691477 0.012210207 0.036384317 +Loop time of 2.8531 on 1 procs for 3000 steps with 4096 atoms + +Performance: 908.486 tau/day, 1051.488 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 3.2425e-05 | 3.2425e-05 | 3.2425e-05 | 0.0 | 0.00 +Comm | 0.13219 | 0.13219 | 0.13219 | 0.0 | 4.63 +Output | 0.0045686 | 0.0045686 | 0.0045686 | 0.0 | 0.16 +Modify | 2.5857 | 2.5857 | 2.5857 | 0.0 | 90.63 +Other | | 0.1307 | | | 4.58 + +Nlocal: 4096.00 ave 4096 max 4096 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 1 +Dangerous builds = 0 +Total wall time: 0:00:02 diff --git a/examples/USER/brownian/sphere/log.11May2021.in3d.sphere.g++.4 b/examples/USER/brownian/sphere/log.11May2021.in3d.sphere.g++.4 new file mode 100644 index 0000000000..57dc6d5f11 --- /dev/null +++ b/examples/USER/brownian/sphere/log.11May2021.in3d.sphere.g++.4 @@ -0,0 +1,126 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +##### overdamped dynamics of a sphere (with dipole attached to it) in 3D##### + +variable rng string uniform +variable seed string 198098 +variable temp string 1.0 +variable gamma_t string 5.0 +variable gamma_r string 0.7 +variable params string ${rng}_${temp}_${gamma_r}_${gamma_t} +variable params string uniform_${temp}_${gamma_r}_${gamma_t} +variable params string uniform_1.0_${gamma_r}_${gamma_t} +variable params string uniform_1.0_0.7_${gamma_t} +variable params string uniform_1.0_0.7_5.0 + +units lj +atom_style hybrid dipole sphere +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 3 +newton off + +lattice sc 0.4 +Lattice spacing in x,y,z = 1.3572088 1.3572088 1.3572088 +region box block -8 8 -8 8 -8 8 +create_box 1 box +Created orthogonal box = (-10.857670 -10.857670 -10.857670) to (10.857670 10.857670 10.857670) + 2 by 1 by 2 MPI processor grid +create_atoms 1 box +Created 4096 atoms + create_atoms CPU = 0.006 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 198098 1.0 +Setting atom values ... + 4096 settings made for dipole/random +velocity all create 1.0 1 loop geom + +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +pair_style none + +fix 1 all brownian/sphere ${temp} ${seed} rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 ${seed} rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng ${rng} gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r ${gamma_r} gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r 0.7 gamma_t ${gamma_t} +fix 1 all brownian/sphere 1.0 198098 rng uniform gamma_r 0.7 gamma_t 5.0 + +#initialisation for the main run + +# MSD +compute msd all msd + +thermo_style custom step ke pe c_msd[*] + +#dump 1 all custom 1000 dump_${params}_3d.lammpstrj id type # x y z xu yu zu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +timestep 0.00001 +thermo 100 + +# main run +run 3000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 4.694 | 4.694 | 4.694 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] + 0 1.4996338 0 0 0 0 0 + 100 30882.707 0 0.00040787161 0.00039391576 0.00040796913 0.0012097565 + 200 31370.751 0 0.00081742036 0.00078240455 0.0008195167 0.0024193416 + 300 31469.51 0 0.0012118299 0.0011819412 0.0012477119 0.003641483 + 400 31696.58 0 0.0015540547 0.0015849689 0.0015836091 0.0047226327 + 500 31488.269 0 0.0019638041 0.0019659637 0.0020547832 0.005984551 + 600 30942.589 0 0.0023273784 0.0023572171 0.0024715245 0.00715612 + 700 31228.473 0 0.0027821732 0.002735338 0.0028734675 0.0083909787 + 800 31426.92 0 0.0031663838 0.0031092782 0.0033231014 0.0095987634 + 900 31447.595 0 0.003539588 0.003564381 0.003753036 0.010857005 + 1000 31363.911 0 0.0039854308 0.003937555 0.0041203919 0.012043378 + 1100 31522.958 0 0.0043009285 0.0043676491 0.0044799414 0.013148519 + 1200 31403.033 0 0.0046361199 0.0047513598 0.0049014974 0.014288977 + 1300 31752.182 0 0.0049824718 0.0051327113 0.0053130614 0.015428244 + 1400 31336.955 0 0.0054251445 0.0055442325 0.0057472998 0.016716677 + 1500 31224.306 0 0.0059295596 0.0059920697 0.0061375228 0.018059152 + 1600 31744.535 0 0.0063845142 0.0063600989 0.0064833215 0.019227935 + 1700 31472.081 0 0.0068360092 0.0067985824 0.0069464303 0.020581022 + 1800 31577.334 0 0.0073001079 0.0071355564 0.0073400543 0.021775719 + 1900 31521.234 0 0.0077178677 0.0074371106 0.007708008 0.022862986 + 2000 31045.148 0 0.0080515968 0.0078583776 0.0081000219 0.024009996 + 2100 31289.809 0 0.0084280175 0.0082322226 0.0084475904 0.02510783 + 2200 31505.455 0 0.008802925 0.0085708943 0.0087648194 0.026138639 + 2300 31882.722 0 0.0092223105 0.0089242925 0.0092643028 0.027410906 + 2400 31028.15 0 0.0095737559 0.0093585981 0.0096771837 0.028609538 + 2500 31581.041 0 0.0099316284 0.009785264 0.010100235 0.029817127 + 2600 31272.119 0 0.010332986 0.01007291 0.010474606 0.030880502 + 2700 31537.8 0 0.010751592 0.010565273 0.01093107 0.032247935 + 2800 31060.697 0 0.011156729 0.011010751 0.011260025 0.033427506 + 2900 31541.612 0 0.011542003 0.011499419 0.011642873 0.034684295 + 3000 31305.382 0 0.011876832 0.011866445 0.012052577 0.035795854 +Loop time of 1.00142 on 4 procs for 3000 steps with 4096 atoms + +Performance: 2588.329 tau/day, 2995.751 timesteps/s +95.2% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.084416 | 0.10685 | 0.12695 | 5.7 | 10.67 +Output | 0.001471 | 0.0019466 | 0.0033245 | 1.8 | 0.19 +Modify | 0.66585 | 0.70645 | 0.78995 | 6.0 | 70.54 +Other | | 0.1862 | | | 18.59 + +Nlocal: 1024.00 ave 1024 max 1024 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Nghost: 353.000 ave 353 max 353 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 0 +Dangerous builds = 0 +Total wall time: 0:00:01 diff --git a/examples/USER/brownian/spherical_ABP/in2d.abp b/examples/USER/brownian/spherical_ABP/in2d.abp new file mode 100644 index 0000000000..bd7dce13b6 --- /dev/null +++ b/examples/USER/brownian/spherical_ABP/in2d.abp @@ -0,0 +1,60 @@ +# 2D overdamped active brownian particle dynamics (ABP) +# with WCA potential + +variable gamma_t string 1.0 +variable gamma_r string 1.0 +variable temp string 1.0 +variable seed equal 1974019 +variable fp string 4.0 +variable params string ${temp}_${gamma_t}_${gamma_r}_${fp} + +units lj +atom_style hybrid dipole sphere +dimension 2 +newton off + +lattice sq 0.4 +region box block -16 16 -16 16 -0.2 0.2 +create_box 1 box +create_atoms 1 box +mass * 1.0 +set type * dipole/random ${seed} 1.0 +velocity all create 1.0 1 loop geom + +# more careful with neighbors since higher diffusion in abps +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +# WCA potential (purely repulsive) +pair_style lj/cut 2.5 +pair_coeff * * 1.0 1.0 1.1224 +pair_modify shift yes + +# overdamped brownian dynamics time-step +fix step all brownian/sphere ${temp} ${seed} gamma_t ${gamma_t} gamma_r ${gamma_r} +# self-propulsion force along the dipole direction +fix activity all propel/self dipole ${fp} +fix 2 all enforce2d + +compute press all pressure NULL virial + +thermo_style custom step pe ke c_press + +#equilibration +timestep 0.0000000001 +thermo 100 +run 5000 +reset_timestep 0 + + +# MSD +compute msd all msd + +thermo_style custom step temp epair c_msd[*] c_press + + +timestep 0.00001 +thermo 1000 + +# main run +run 20000 diff --git a/examples/USER/brownian/spherical_ABP/in3d.ideal_abp b/examples/USER/brownian/spherical_ABP/in3d.ideal_abp new file mode 100644 index 0000000000..6a6575b510 --- /dev/null +++ b/examples/USER/brownian/spherical_ABP/in3d.ideal_abp @@ -0,0 +1,67 @@ +# 3D overdamped active brownian dynamics with no interactions + +variable gamma_t string 3.0 +variable gamma_r string 1.0 +variable temp string 1.0 +variable seed equal 1974019 +variable fp string 4.0 +variable params string ${temp}_${gamma_t}_${gamma_r}_${fp} + +units lj +atom_style hybrid dipole sphere +dimension 3 +newton off + +lattice sc 0.4 +region box block -8 8 -8 8 -8 8 +create_box 1 box +create_atoms 1 box +mass * 1.0 +set type * dipole/random ${seed} 1.0 +velocity all create 1.0 1 loop geom + +pair_style none + +# overdamped brownian dynamics time-step +fix step all brownian/sphere ${temp} ${seed} gamma_t ${gamma_t} gamma_r ${gamma_r} +# self-propulsion force along the dipole direction +fix activity all propel/self dipole ${fp} + +compute press all pressure NULL virial + +thermo_style custom step ke pe c_press + +#equilibration +timestep 0.0000000001 +thermo 100 +run 5000 +reset_timestep 0 + +# MSD to demonstrate expected diffusive behaviour for ideal active +# brownian motion, which is +# +# MSD = (2*d*kb*T/gamma_t + 2*fp**2*gamma_r/(kb*T*gamma_t**2*(d-1)))*t +# + 2*fp**2*gamma_r**2/(gamma_t**2*(d-1)**2*(kb*T)**2)*(e^(-(d-1)*t*kb*T/gamma_r)-1) +# +# with d being simulation dimension +compute msd all msd + +thermo_style custom step ke pe c_msd[*] c_press + + +timestep 0.00001 +thermo 1000 + +# main run +run 12000 + +# if you want to check that rotational diffusion is behaving as expected, +# uncomment next three lines for dump output and then plot , +# which should decay exponentially with timescale (d-1)*D_r (with d +# being simulation dimension) + +#dump 1 all custom 2000 dump_ideal_${params}_3d.lammpstrj id type & +# x y xu yu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +#run 120000 diff --git a/examples/USER/brownian/spherical_ABP/log.11May2021.in2d.apb.g++.1 b/examples/USER/brownian/spherical_ABP/log.11May2021.in2d.apb.g++.1 new file mode 100644 index 0000000000..452d8a3d52 --- /dev/null +++ b/examples/USER/brownian/spherical_ABP/log.11May2021.in2d.apb.g++.1 @@ -0,0 +1,221 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +# 2D overdamped active brownian particle dynamics (ABP) +# with WCA potential + +variable gamma_t string 1.0 +variable gamma_r string 1.0 +variable temp string 1.0 +variable seed equal 1974019 +variable fp string 4.0 +variable params string ${temp}_${gamma_t}_${gamma_r}_${fp} +variable params string 1.0_${gamma_t}_${gamma_r}_${fp} +variable params string 1.0_1.0_${gamma_r}_${fp} +variable params string 1.0_1.0_1.0_${fp} +variable params string 1.0_1.0_1.0_4.0 + +units lj +atom_style hybrid dipole sphere +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 2 +newton off + +lattice sq 0.4 +Lattice spacing in x,y,z = 1.5811388 1.5811388 1.5811388 +region box block -16 16 -16 16 -0.2 0.2 +create_box 1 box +Created orthogonal box = (-25.298221 -25.298221 -0.31622777) to (25.298221 25.298221 0.31622777) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 1024 atoms + create_atoms CPU = 0.002 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 1974019 1.0 +Setting atom values ... + 1024 settings made for dipole/random +velocity all create 1.0 1 loop geom + +# more careful with neighbors since higher diffusion in abps +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +# WCA potential (purely repulsive) +pair_style lj/cut 2.5 +pair_coeff * * 1.0 1.0 1.1224 +pair_modify shift yes + +# overdamped brownian dynamics time-step +fix step all brownian/sphere ${temp} ${seed} gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 ${seed} gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t 1.0 gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t 1.0 gamma_r 1.0 +# self-propulsion force along the dipole direction +fix activity all propel/self dipole ${fp} +fix activity all propel/self dipole 4.0 +fix 2 all enforce2d + +compute press all pressure NULL virial + +thermo_style custom step pe ke c_press + +#equilibration +timestep 0.0000000001 +thermo 100 +run 5000 +Neighbor list info ... + update every 1 steps, delay 1 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 2.1224 + ghost atom cutoff = 2.1224 + binsize = 1.0612, bins = 48 48 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/cut, perpetual + attributes: half, newton off + pair build: half/bin/newtoff + stencil: half/bin/2d/newtoff + bin: standard +Per MPI rank memory allocation (min/avg/max) = 5.066 | 5.066 | 5.066 Mbytes +Step PotEng KinEng c_press + 0 0 0.99902344 -0.53979198 + 100 0 1.026585e+10 -0.5398101 + 200 0 1.0630628e+10 -0.53977393 + 300 0 1.03483e+10 -0.53977041 + 400 0 1.049279e+10 -0.53974314 + 500 0 1.0832067e+10 -0.53979451 + 600 0 1.0403632e+10 -0.53976233 + 700 0 1.0334726e+10 -0.53976174 + 800 0 1.0119596e+10 -0.53969338 + 900 0 1.0786136e+10 -0.53970415 + 1000 0 1.0539036e+10 -0.53974577 + 1100 0 1.0643695e+10 -0.53982431 + 1200 0 1.0234642e+10 -0.53976823 + 1300 0 1.036268e+10 -0.53981454 + 1400 0 1.0605702e+10 -0.53988117 + 1500 0 1.0517916e+10 -0.53989207 + 1600 0 1.0564482e+10 -0.53993016 + 1700 0 1.0460152e+10 -0.53984454 + 1800 0 1.0468566e+10 -0.53985574 + 1900 0 1.0474075e+10 -0.53985439 + 2000 0 1.0683568e+10 -0.53987349 + 2100 0 1.0269077e+10 -0.53990709 + 2200 0 1.0386943e+10 -0.53990068 + 2300 0 1.0406078e+10 -0.53978402 + 2400 0 1.0482072e+10 -0.53980757 + 2500 0 1.0442975e+10 -0.53982657 + 2600 0 1.0292103e+10 -0.53985533 + 2700 0 1.1106453e+10 -0.53991861 + 2800 0 1.0395289e+10 -0.53990138 + 2900 0 1.034021e+10 -0.53992375 + 3000 0 1.0434718e+10 -0.53995566 + 3100 0 1.0194094e+10 -0.53993997 + 3200 0 1.0411552e+10 -0.54000097 + 3300 0 1.0214175e+10 -0.53999884 + 3400 0 1.0434719e+10 -0.54000005 + 3500 0 1.0529638e+10 -0.53998281 + 3600 0 1.0406541e+10 -0.54000141 + 3700 0 1.0577151e+10 -0.54002354 + 3800 0 1.0488249e+10 -0.53996003 + 3900 0 1.0316153e+10 -0.54002024 + 4000 0 1.0491289e+10 -0.5400259 + 4100 0 1.0587981e+10 -0.5399811 + 4200 0 1.0332035e+10 -0.53997951 + 4300 0 1.0776469e+10 -0.53994151 + 4400 0 1.0982142e+10 -0.53983842 + 4500 0 1.0796919e+10 -0.5398414 + 4600 0 1.0324249e+10 -0.53979712 + 4700 0 1.0420899e+10 -0.53981967 + 4800 0 1.0274188e+10 -0.53976759 + 4900 0 1.0411535e+10 -0.5397757 + 5000 0 1.0399215e+10 -0.53980199 +Loop time of 1.34285 on 1 procs for 5000 steps with 1024 atoms + +Performance: 0.032 tau/day, 3723.422 timesteps/s +99.8% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.25309 | 0.25309 | 0.25309 | 0.0 | 18.85 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.068734 | 0.068734 | 0.068734 | 0.0 | 5.12 +Output | 0.0012887 | 0.0012887 | 0.0012887 | 0.0 | 0.10 +Modify | 0.96552 | 0.96552 | 0.96552 | 0.0 | 71.90 +Other | | 0.05422 | | | 4.04 + +Nlocal: 1024.00 ave 1024 max 1024 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 201.000 ave 201 max 201 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 2112.00 ave 2112 max 2112 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 2112 +Ave neighs/atom = 2.0625000 +Neighbor list builds = 0 +Dangerous builds = 0 +reset_timestep 0 + + +# MSD +compute msd all msd + +thermo_style custom step temp epair c_msd[*] c_press + + +timestep 0.00001 +thermo 1000 + +# main run +run 20000 +Per MPI rank memory allocation (min/avg/max) = 5.441 | 5.441 | 5.441 Mbytes +Step Temp E_pair c_msd[1] c_msd[2] c_msd[3] c_msd[4] c_press + 0 1.0409381e+10 0 0 0 0 0 -0.53980199 + 1000 107022.73 0.0080050427 0.020451432 0.021388798 0 0.04184023 -0.54900967 + 2000 107475.82 0.017262846 0.040669645 0.044251149 0 0.084920794 -0.42740968 + 3000 105388.35 0.042257875 0.062828995 0.05845782 0 0.12128682 -0.31792184 + 4000 106238.38 0.052733384 0.079036841 0.079396453 0 0.15843329 -0.24243699 + 5000 102904.54 0.088524456 0.095977642 0.099533961 0 0.1955116 -0.093468615 + 6000 105274.15 0.065334999 0.11591691 0.11675531 0 0.23267222 -0.21904478 + 7000 108903.41 0.06724271 0.13694218 0.13914947 0 0.27609164 -0.15913012 + 8000 101451.44 0.097201152 0.15704893 0.16178845 0 0.31883738 -0.055786965 + 9000 106808.72 0.084301668 0.18029391 0.175753 0 0.3560469 0.014898739 + 10000 107381.19 0.088583354 0.2000753 0.19569789 0 0.39577319 0.19417596 + 11000 102105.78 0.081066654 0.22042599 0.21914042 0 0.43956641 0.060574143 + 12000 105384.94 0.098716908 0.24382064 0.24673594 0 0.49055657 0.17067875 + 13000 107479.53 0.099989043 0.26942088 0.27207566 0 0.54149654 0.25514896 + 14000 102938.12 0.093252916 0.28529564 0.28698837 0 0.57228401 0.19976355 + 15000 104408.02 0.11900926 0.31291315 0.31195058 0 0.62486373 0.36956014 + 16000 103447.68 0.09627777 0.34145225 0.33159885 0 0.6730511 0.29857404 + 17000 108400.05 0.11433561 0.36561966 0.36068301 0 0.72630267 0.41922801 + 18000 103363.68 0.11040153 0.38709746 0.39228677 0 0.77938423 0.38111686 + 19000 103310.43 0.10660536 0.41406235 0.40975085 0 0.8238132 0.36022184 + 20000 102692.1 0.13517651 0.43870812 0.44138776 0 0.88009588 0.51144366 +Loop time of 5.66207 on 1 procs for 20000 steps with 1024 atoms + +Performance: 3051.889 tau/day, 3532.279 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 1.3123 | 1.3123 | 1.3123 | 0.0 | 23.18 +Neigh | 0.011856 | 0.011856 | 0.011856 | 0.0 | 0.21 +Comm | 0.2747 | 0.2747 | 0.2747 | 0.0 | 4.85 +Output | 0.0011516 | 0.0011516 | 0.0011516 | 0.0 | 0.02 +Modify | 3.8451 | 3.8451 | 3.8451 | 0.0 | 67.91 +Other | | 0.2169 | | | 3.83 + +Nlocal: 1024.00 ave 1024 max 1024 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 184.000 ave 184 max 184 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 2558.00 ave 2558 max 2558 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 2558 +Ave neighs/atom = 2.4980469 +Neighbor list builds = 23 +Dangerous builds = 0 +Total wall time: 0:00:07 diff --git a/examples/USER/brownian/spherical_ABP/log.11May2021.in2d.apb.g++.4 b/examples/USER/brownian/spherical_ABP/log.11May2021.in2d.apb.g++.4 new file mode 100644 index 0000000000..aa902fbe80 --- /dev/null +++ b/examples/USER/brownian/spherical_ABP/log.11May2021.in2d.apb.g++.4 @@ -0,0 +1,221 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +# 2D overdamped active brownian particle dynamics (ABP) +# with WCA potential + +variable gamma_t string 1.0 +variable gamma_r string 1.0 +variable temp string 1.0 +variable seed equal 1974019 +variable fp string 4.0 +variable params string ${temp}_${gamma_t}_${gamma_r}_${fp} +variable params string 1.0_${gamma_t}_${gamma_r}_${fp} +variable params string 1.0_1.0_${gamma_r}_${fp} +variable params string 1.0_1.0_1.0_${fp} +variable params string 1.0_1.0_1.0_4.0 + +units lj +atom_style hybrid dipole sphere +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 2 +newton off + +lattice sq 0.4 +Lattice spacing in x,y,z = 1.5811388 1.5811388 1.5811388 +region box block -16 16 -16 16 -0.2 0.2 +create_box 1 box +Created orthogonal box = (-25.298221 -25.298221 -0.31622777) to (25.298221 25.298221 0.31622777) + 2 by 2 by 1 MPI processor grid +create_atoms 1 box +Created 1024 atoms + create_atoms CPU = 0.001 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 1974019 1.0 +Setting atom values ... + 1024 settings made for dipole/random +velocity all create 1.0 1 loop geom + +# more careful with neighbors since higher diffusion in abps +neighbor 1.0 bin +neigh_modify every 1 delay 1 check yes + +# WCA potential (purely repulsive) +pair_style lj/cut 2.5 +pair_coeff * * 1.0 1.0 1.1224 +pair_modify shift yes + +# overdamped brownian dynamics time-step +fix step all brownian/sphere ${temp} ${seed} gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 ${seed} gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t 1.0 gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t 1.0 gamma_r 1.0 +# self-propulsion force along the dipole direction +fix activity all propel/self dipole ${fp} +fix activity all propel/self dipole 4.0 +fix 2 all enforce2d + +compute press all pressure NULL virial + +thermo_style custom step pe ke c_press + +#equilibration +timestep 0.0000000001 +thermo 100 +run 5000 +Neighbor list info ... + update every 1 steps, delay 1 steps, check yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 2.1224 + ghost atom cutoff = 2.1224 + binsize = 1.0612, bins = 48 48 1 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/cut, perpetual + attributes: half, newton off + pair build: half/bin/newtoff + stencil: half/bin/2d/newtoff + bin: standard +Per MPI rank memory allocation (min/avg/max) = 5.052 | 5.052 | 5.052 Mbytes +Step PotEng KinEng c_press + 0 0 0.99902344 -0.53979198 + 100 0 1.0503521e+10 -0.53983092 + 200 0 1.0390343e+10 -0.5398287 + 300 0 1.0493441e+10 -0.53979247 + 400 0 1.0545991e+10 -0.53978678 + 500 0 1.0266398e+10 -0.53986297 + 600 0 1.0484775e+10 -0.53978746 + 700 0 1.0583596e+10 -0.53969114 + 800 0 1.0521919e+10 -0.53968754 + 900 0 1.0492019e+10 -0.53958643 + 1000 0 1.0298052e+10 -0.53949872 + 1100 0 1.0531424e+10 -0.53955431 + 1200 0 1.0635635e+10 -0.53960048 + 1300 0 1.0633405e+10 -0.53966331 + 1400 0 1.0195401e+10 -0.53968849 + 1500 0 1.0593758e+10 -0.53969763 + 1600 0 1.0425238e+10 -0.53971936 + 1700 0 1.0470017e+10 -0.53981957 + 1800 0 1.0545953e+10 -0.53987747 + 1900 0 1.0425015e+10 -0.53990412 + 2000 0 1.0655092e+10 -0.5399511 + 2100 0 1.0197224e+10 -0.53988687 + 2200 0 1.0448012e+10 -0.53986066 + 2300 0 1.0355268e+10 -0.53980415 + 2400 0 1.0246979e+10 -0.53979737 + 2500 0 1.0021539e+10 -0.5397919 + 2600 0 1.0200824e+10 -0.5397575 + 2700 0 1.0721591e+10 -0.53973512 + 2800 0 1.0354562e+10 -0.5397127 + 2900 0 1.0306795e+10 -0.5396946 + 3000 0 1.0301339e+10 -0.53968642 + 3100 0 1.0435826e+10 -0.53970945 + 3200 0 1.019524e+10 -0.53969746 + 3300 0 1.0550481e+10 -0.53967977 + 3400 0 1.0283446e+10 -0.53971102 + 3500 0 1.0956695e+10 -0.53976173 + 3600 0 1.0271033e+10 -0.53983632 + 3700 0 1.0389461e+10 -0.53977293 + 3800 0 1.0680515e+10 -0.53977425 + 3900 0 1.0072183e+10 -0.53982922 + 4000 0 1.0458036e+10 -0.53980042 + 4100 0 1.0588689e+10 -0.53971405 + 4200 0 1.0068308e+10 -0.5398033 + 4300 0 1.0502064e+10 -0.53981291 + 4400 0 1.0590544e+10 -0.5398346 + 4500 0 1.0411612e+10 -0.5397916 + 4600 0 1.0518596e+10 -0.53984868 + 4700 0 1.0386105e+10 -0.53977803 + 4800 0 1.029525e+10 -0.53970882 + 4900 0 1.0519112e+10 -0.53969616 + 5000 0 1.0335841e+10 -0.53976477 +Loop time of 0.471663 on 4 procs for 5000 steps with 1024 atoms + +Performance: 0.092 tau/day, 10600.781 timesteps/s +95.8% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.067099 | 0.07105 | 0.077898 | 1.6 | 15.06 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.0581 | 0.066084 | 0.072322 | 2.0 | 14.01 +Output | 0.0014644 | 0.002618 | 0.0037239 | 1.6 | 0.56 +Modify | 0.24817 | 0.25719 | 0.26697 | 1.3 | 54.53 +Other | | 0.07472 | | | 15.84 + +Nlocal: 256.000 ave 256 max 256 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Nghost: 105.000 ave 105 max 105 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 544.000 ave 544 max 544 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 2176 +Ave neighs/atom = 2.1250000 +Neighbor list builds = 0 +Dangerous builds = 0 +reset_timestep 0 + + +# MSD +compute msd all msd + +thermo_style custom step temp epair c_msd[*] c_press + + +timestep 0.00001 +thermo 1000 + +# main run +run 20000 +Per MPI rank memory allocation (min/avg/max) = 5.427 | 5.427 | 5.427 Mbytes +Step Temp E_pair c_msd[1] c_msd[2] c_msd[3] c_msd[4] c_press + 0 1.0345945e+10 0 0 0 0 0 -0.53976477 + 1000 100114.28 0.0029703577 0.020320684 0.020950989 0 0.041271673 -0.43948247 + 2000 106825.83 0.020969054 0.039616412 0.039459167 0 0.079075578 -0.22765541 + 3000 105287.4 0.037343571 0.056828177 0.058639835 0 0.11546801 -0.11728136 + 4000 104522.23 0.052237136 0.080264931 0.080863543 0 0.16112847 0.033230576 + 5000 103277.94 0.053791862 0.099188864 0.10141444 0 0.20060331 0.073591503 + 6000 104252.87 0.073304776 0.11964238 0.1215576 0 0.24119999 0.22062305 + 7000 105184.19 0.089054043 0.13691291 0.14216478 0 0.27907769 0.29015692 + 8000 104211.82 0.072577918 0.15820522 0.15658491 0 0.31479013 0.25908291 + 9000 99242.172 0.071616004 0.17658708 0.17479704 0 0.35138412 0.26305532 + 10000 105070.83 0.077009979 0.20175025 0.19871513 0 0.40046538 0.34120567 + 11000 106421.07 0.098623061 0.22472634 0.22671582 0 0.45144216 0.44021335 + 12000 103209.85 0.12032847 0.25004966 0.25368441 0 0.50373406 0.57344873 + 13000 107156.89 0.1058386 0.27283231 0.2744873 0 0.54731961 0.47957408 + 14000 108119.3 0.1204768 0.29333677 0.30054535 0 0.59388213 0.51832639 + 15000 105477.62 0.12510026 0.32217621 0.32806599 0 0.6502422 0.50174158 + 16000 106676.27 0.10893618 0.34980866 0.36031184 0 0.7101205 0.44769198 + 17000 103048.41 0.10625673 0.3781797 0.37970499 0 0.75788468 0.42803898 + 18000 109454.8 0.10555778 0.40997694 0.41396777 0 0.82394471 0.41380982 + 19000 107459.73 0.11267582 0.43757738 0.43577856 0 0.87335594 0.4917748 + 20000 101991.9 0.084279008 0.45363612 0.46278076 0 0.91641688 0.41707912 +Loop time of 1.80877 on 4 procs for 20000 steps with 1024 atoms + +Performance: 9553.439 tau/day, 11057.221 timesteps/s +98.5% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.34461 | 0.36424 | 0.40948 | 4.4 | 20.14 +Neigh | 0.0031493 | 0.003215 | 0.0032432 | 0.1 | 0.18 +Comm | 0.19538 | 0.20419 | 0.2104 | 1.2 | 11.29 +Output | 0.00054121 | 0.00087297 | 0.0018425 | 0.0 | 0.05 +Modify | 0.98335 | 1.0156 | 1.0791 | 3.8 | 56.15 +Other | | 0.2207 | | | 12.20 + +Nlocal: 256.000 ave 261 max 252 min +Histogram: 1 1 0 0 0 0 1 0 0 1 +Nghost: 93.0000 ave 100 max 83 min +Histogram: 1 0 0 0 0 0 1 1 0 1 +Neighs: 662.250 ave 693 max 635 min +Histogram: 1 0 1 0 0 0 1 0 0 1 + +Total # of neighbors = 2649 +Ave neighs/atom = 2.5869141 +Neighbor list builds = 23 +Dangerous builds = 0 +Total wall time: 0:00:02 diff --git a/examples/USER/brownian/spherical_ABP/log.11May2021.in3d.ideal_apb.g++.1 b/examples/USER/brownian/spherical_ABP/log.11May2021.in3d.ideal_apb.g++.1 new file mode 100644 index 0000000000..0820471faa --- /dev/null +++ b/examples/USER/brownian/spherical_ABP/log.11May2021.in3d.ideal_apb.g++.1 @@ -0,0 +1,210 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +# 3D overdamped active brownian dynamics with no interactions + +variable gamma_t string 3.0 +variable gamma_r string 1.0 +variable temp string 1.0 +variable seed equal 1974019 +variable fp string 4.0 +variable params string ${temp}_${gamma_t}_${gamma_r}_${fp} +variable params string 1.0_${gamma_t}_${gamma_r}_${fp} +variable params string 1.0_3.0_${gamma_r}_${fp} +variable params string 1.0_3.0_1.0_${fp} +variable params string 1.0_3.0_1.0_4.0 + +units lj +atom_style hybrid dipole sphere +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 3 +newton off + +lattice sc 0.4 +Lattice spacing in x,y,z = 1.3572088 1.3572088 1.3572088 +region box block -8 8 -8 8 -8 8 +create_box 1 box +Created orthogonal box = (-10.857670 -10.857670 -10.857670) to (10.857670 10.857670 10.857670) + 1 by 1 by 1 MPI processor grid +create_atoms 1 box +Created 4096 atoms + create_atoms CPU = 0.004 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 1974019 1.0 +Setting atom values ... + 4096 settings made for dipole/random +velocity all create 1.0 1 loop geom + +pair_style none + +# overdamped brownian dynamics time-step +fix step all brownian/sphere ${temp} ${seed} gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 ${seed} gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t 3.0 gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t 3.0 gamma_r 1.0 +# self-propulsion force along the dipole direction +fix activity all propel/self dipole ${fp} +fix activity all propel/self dipole 4.0 + +compute press all pressure NULL virial + +thermo_style custom step ke pe c_press + +#equilibration +timestep 0.0000000001 +thermo 100 +run 5000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 4.362 | 4.362 | 4.362 Mbytes +Step KinEng PotEng c_press + 0 1.4996338 0 0.068021726 + 100 5.184227e+09 0 0.06801544 + 200 5.2165482e+09 0 0.068010729 + 300 5.2782092e+09 0 0.068009058 + 400 5.3244927e+09 0 0.068003481 + 500 5.2376606e+09 0 0.067998237 + 600 5.2735634e+09 0 0.067998037 + 700 5.2692439e+09 0 0.068025402 + 800 5.2667984e+09 0 0.068030143 + 900 5.242057e+09 0 0.0680246 + 1000 5.2557468e+09 0 0.068028348 + 1100 5.2975687e+09 0 0.068029528 + 1200 5.2081927e+09 0 0.068017542 + 1300 5.2636873e+09 0 0.068012572 + 1400 5.2187907e+09 0 0.06802049 + 1500 5.2349541e+09 0 0.0680373 + 1600 5.216092e+09 0 0.068056885 + 1700 5.2598019e+09 0 0.068069504 + 1800 5.2569065e+09 0 0.068065306 + 1900 5.2072055e+09 0 0.068074863 + 2000 5.2092961e+09 0 0.068061619 + 2100 5.2918572e+09 0 0.068076418 + 2200 5.2680626e+09 0 0.068072149 + 2300 5.242958e+09 0 0.06806486 + 2400 5.2494099e+09 0 0.06805038 + 2500 5.2055798e+09 0 0.068072194 + 2600 5.2264829e+09 0 0.068069312 + 2700 5.3557342e+09 0 0.068064812 + 2800 5.2186177e+09 0 0.068042942 + 2900 5.2652497e+09 0 0.068044214 + 3000 5.1894899e+09 0 0.068044801 + 3100 5.241524e+09 0 0.068056675 + 3200 5.1915006e+09 0 0.06805641 + 3300 5.2367825e+09 0 0.068049946 + 3400 5.2288011e+09 0 0.068060182 + 3500 5.2704335e+09 0 0.068070881 + 3600 5.2886558e+09 0 0.068050439 + 3700 5.1976022e+09 0 0.068045927 + 3800 5.1525512e+09 0 0.068054494 + 3900 5.2212395e+09 0 0.068061432 + 4000 5.2309575e+09 0 0.068070842 + 4100 5.2260184e+09 0 0.068078378 + 4200 5.2829349e+09 0 0.068071652 + 4300 5.2204917e+09 0 0.068083072 + 4400 5.255242e+09 0 0.068066175 + 4500 5.2435681e+09 0 0.068050802 + 4600 5.2483356e+09 0 0.06805658 + 4700 5.2365098e+09 0 0.068041845 + 4800 5.2254325e+09 0 0.068038583 + 4900 5.1842852e+09 0 0.068028401 + 5000 5.2240722e+09 0 0.068031544 +Loop time of 5.14275 on 1 procs for 5000 steps with 4096 atoms + +Performance: 0.008 tau/day, 972.242 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.26842 | 0.26842 | 0.26842 | 0.0 | 5.22 +Output | 0.0035088 | 0.0035088 | 0.0035088 | 0.0 | 0.07 +Modify | 4.6588 | 4.6588 | 4.6588 | 0.0 | 90.59 +Other | | 0.212 | | | 4.12 + +Nlocal: 4096.00 ave 4096 max 4096 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 817.000 ave 817 max 817 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 0 +Dangerous builds = 0 +reset_timestep 0 + +# MSD to demonstrate expected diffusive behaviour for ideal active +# brownian motion, which is +# +# MSD = (2*d*kb*T/gamma_t + 2*fp**2*gamma_r/(kb*T*gamma_t**2*(d-1)))*t +# + 2*fp**2*gamma_r**2/(gamma_t**2*(d-1)**2*(kb*T)**2)*(e^(-(d-1)*t*kb*T/gamma_r)-1) +# +# with d being simulation dimension +compute msd all msd + +thermo_style custom step ke pe c_msd[*] c_press + + +timestep 0.00001 +thermo 1000 + +# main run +run 12000 +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 4.737 | 4.737 | 4.737 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] c_press + 0 5.2240722e+09 0 0 0 0 0 0.068031544 + 1000 52651.581 0 0.0066842466 0.0067977045 0.0066831353 0.020165086 0.060774985 + 2000 52835.806 0 0.013693443 0.014008773 0.013518945 0.041221161 0.094748037 + 3000 52097.629 0 0.020666918 0.021696789 0.020665685 0.063029392 0.10673866 + 4000 52579.452 0 0.028145318 0.028504548 0.02830967 0.084959536 0.13358122 + 5000 51255.456 0 0.035019271 0.034644123 0.03638843 0.10605182 0.13507609 + 6000 52730.035 0 0.041412307 0.042689213 0.043339117 0.12744064 0.16497663 + 7000 52247.642 0 0.048119396 0.050556395 0.050706527 0.14938232 0.16360301 + 8000 52169.849 0 0.055241196 0.058678631 0.059373122 0.17329295 0.1676169 + 9000 52520.526 0 0.063519587 0.066592779 0.066988842 0.19710121 0.17142694 + 10000 53519.297 0 0.07164814 0.074576535 0.075619236 0.22184391 0.15619444 + 11000 52937.293 0 0.077992504 0.083184462 0.082988794 0.24416576 0.15257327 + 12000 51762.283 0 0.085959749 0.090992292 0.08984213 0.26679417 0.15996211 +Loop time of 11.6748 on 1 procs for 12000 steps with 4096 atoms + +Performance: 888.063 tau/day, 1027.851 timesteps/s +99.9% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0.0024164 | 0.0024164 | 0.0024164 | 0.0 | 0.02 +Comm | 0.048127 | 0.048127 | 0.048127 | 0.0 | 0.41 +Output | 0.0019393 | 0.0019393 | 0.0019393 | 0.0 | 0.02 +Modify | 11.12 | 11.12 | 11.12 | 0.0 | 95.24 +Other | | 0.5027 | | | 4.31 + +Nlocal: 4096.00 ave 4096 max 4096 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 75 +Dangerous builds = 0 + +# if you want to check that rotational diffusion is behaving as expected, +# uncomment next three lines for dump output and then plot , +# which should decay exponentially with timescale (d-1)*D_r (with d +# being simulation dimension) + +#dump 1 all custom 2000 dump_ideal_${params}_3d.lammpstrj id type # x y xu yu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +#run 120000 +Total wall time: 0:00:16 diff --git a/examples/USER/brownian/spherical_ABP/log.11May2021.in3d.ideal_apb.g++.4 b/examples/USER/brownian/spherical_ABP/log.11May2021.in3d.ideal_apb.g++.4 new file mode 100644 index 0000000000..c6e5b82090 --- /dev/null +++ b/examples/USER/brownian/spherical_ABP/log.11May2021.in3d.ideal_apb.g++.4 @@ -0,0 +1,210 @@ +LAMMPS (8 Apr 2021) + using 1 OpenMP thread(s) per MPI task +# 3D overdamped active brownian dynamics with no interactions + +variable gamma_t string 3.0 +variable gamma_r string 1.0 +variable temp string 1.0 +variable seed equal 1974019 +variable fp string 4.0 +variable params string ${temp}_${gamma_t}_${gamma_r}_${fp} +variable params string 1.0_${gamma_t}_${gamma_r}_${fp} +variable params string 1.0_3.0_${gamma_r}_${fp} +variable params string 1.0_3.0_1.0_${fp} +variable params string 1.0_3.0_1.0_4.0 + +units lj +atom_style hybrid dipole sphere +WARNING: Atom style hybrid defines both, per-type and per-atom masses; both must be set, but only per-atom masses will be used (src/atom_vec_hybrid.cpp:156) +dimension 3 +newton off + +lattice sc 0.4 +Lattice spacing in x,y,z = 1.3572088 1.3572088 1.3572088 +region box block -8 8 -8 8 -8 8 +create_box 1 box +Created orthogonal box = (-10.857670 -10.857670 -10.857670) to (10.857670 10.857670 10.857670) + 2 by 1 by 2 MPI processor grid +create_atoms 1 box +Created 4096 atoms + create_atoms CPU = 0.002 seconds +mass * 1.0 +set type * dipole/random ${seed} 1.0 +set type * dipole/random 1974019 1.0 +Setting atom values ... + 4096 settings made for dipole/random +velocity all create 1.0 1 loop geom + +pair_style none + +# overdamped brownian dynamics time-step +fix step all brownian/sphere ${temp} ${seed} gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 ${seed} gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t ${gamma_t} gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t 3.0 gamma_r ${gamma_r} +fix step all brownian/sphere 1.0 1974019 gamma_t 3.0 gamma_r 1.0 +# self-propulsion force along the dipole direction +fix activity all propel/self dipole ${fp} +fix activity all propel/self dipole 4.0 + +compute press all pressure NULL virial + +thermo_style custom step ke pe c_press + +#equilibration +timestep 0.0000000001 +thermo 100 +run 5000 +WARNING: No pairwise cutoff or binsize set. Atom sorting therefore disabled. (src/atom.cpp:2141) +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 4.319 | 4.319 | 4.319 Mbytes +Step KinEng PotEng c_press + 0 1.4996338 0 0.068021726 + 100 5.2484581e+09 0 0.068010768 + 200 5.277936e+09 0 0.068024039 + 300 5.2651427e+09 0 0.068014821 + 400 5.2066432e+09 0 0.068027583 + 500 5.2250276e+09 0 0.068030242 + 600 5.239771e+09 0 0.068018406 + 700 5.1953674e+09 0 0.068017081 + 800 5.2097107e+09 0 0.068010167 + 900 5.2559863e+09 0 0.068012923 + 1000 5.2743197e+09 0 0.068017855 + 1100 5.1999741e+09 0 0.068014189 + 1200 5.3216344e+09 0 0.068005604 + 1300 5.2839264e+09 0 0.067982558 + 1400 5.2462761e+09 0 0.067977843 + 1500 5.2208208e+09 0 0.067979594 + 1600 5.2740284e+09 0 0.067972573 + 1700 5.1919692e+09 0 0.067974452 + 1800 5.2497614e+09 0 0.067966417 + 1900 5.2910442e+09 0 0.067976096 + 2000 5.27238e+09 0 0.067963979 + 2100 5.3305398e+09 0 0.06795661 + 2200 5.205471e+09 0 0.067970212 + 2300 5.1803713e+09 0 0.067931775 + 2400 5.2134311e+09 0 0.067941825 + 2500 5.2367424e+09 0 0.067963456 + 2600 5.2246738e+09 0 0.067957556 + 2700 5.2514573e+09 0 0.067960724 + 2800 5.2601577e+09 0 0.067965167 + 2900 5.2422855e+09 0 0.067956561 + 3000 5.1796674e+09 0 0.067946764 + 3100 5.2308189e+09 0 0.067946585 + 3200 5.1835395e+09 0 0.067951909 + 3300 5.2762112e+09 0 0.067963199 + 3400 5.3224133e+09 0 0.067944918 + 3500 5.2314242e+09 0 0.06795318 + 3600 5.2760337e+09 0 0.067958005 + 3700 5.2549349e+09 0 0.06795228 + 3800 5.3343065e+09 0 0.067944561 + 3900 5.2440993e+09 0 0.067947433 + 4000 5.2565026e+09 0 0.067962624 + 4100 5.1766738e+09 0 0.067949542 + 4200 5.2058437e+09 0 0.067959946 + 4300 5.2777775e+09 0 0.067945883 + 4400 5.2020331e+09 0 0.067953495 + 4500 5.1417619e+09 0 0.067944161 + 4600 5.2672994e+09 0 0.067936777 + 4700 5.222847e+09 0 0.067943025 + 4800 5.2467842e+09 0 0.06794191 + 4900 5.2784378e+09 0 0.067939495 + 5000 5.2563969e+09 0 0.067940246 +Loop time of 1.55848 on 4 procs for 5000 steps with 4096 atoms + +Performance: 0.028 tau/day, 3208.260 timesteps/s +97.1% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0 | 0 | 0 | 0.0 | 0.00 +Comm | 0.15304 | 0.15677 | 0.16459 | 1.2 | 10.06 +Output | 0.0012078 | 0.0021182 | 0.0047011 | 3.2 | 0.14 +Modify | 1.1966 | 1.2236 | 1.2761 | 2.8 | 78.51 +Other | | 0.176 | | | 11.29 + +Nlocal: 1024.00 ave 1024 max 1024 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Nghost: 353.000 ave 353 max 353 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 0 +Dangerous builds = 0 +reset_timestep 0 + +# MSD to demonstrate expected diffusive behaviour for ideal active +# brownian motion, which is +# +# MSD = (2*d*kb*T/gamma_t + 2*fp**2*gamma_r/(kb*T*gamma_t**2*(d-1)))*t +# + 2*fp**2*gamma_r**2/(gamma_t**2*(d-1)**2*(kb*T)**2)*(e^(-(d-1)*t*kb*T/gamma_r)-1) +# +# with d being simulation dimension +compute msd all msd + +thermo_style custom step ke pe c_msd[*] c_press + + +timestep 0.00001 +thermo 1000 + +# main run +run 12000 +WARNING: Communication cutoff is 0.0. No ghost atoms will be generated. Atoms may get lost. (src/comm_brick.cpp:167) +Per MPI rank memory allocation (min/avg/max) = 4.694 | 4.694 | 4.694 Mbytes +Step KinEng PotEng c_msd[1] c_msd[2] c_msd[3] c_msd[4] c_press + 0 5.2563969e+09 0 0 0 0 0 0.067940246 + 1000 52568.549 0 0.0067249858 0.0066478843 0.0066014231 0.019974293 0.066777589 + 2000 52836.937 0 0.013611101 0.013799663 0.013161144 0.040571907 0.066769693 + 3000 52129.467 0 0.020360834 0.02089829 0.01995025 0.061209374 0.060026879 + 4000 52075.177 0 0.027638751 0.028062314 0.026895904 0.082596969 0.078290387 + 5000 52203.996 0 0.034087112 0.034933104 0.033832559 0.10285278 0.083657551 + 6000 52986.764 0 0.041562413 0.042238976 0.040542538 0.12434393 0.11542014 + 7000 51941.229 0 0.049216989 0.049250201 0.047598008 0.1460652 0.13739893 + 8000 52618.713 0 0.057198947 0.057409217 0.05404895 0.16865711 0.13681938 + 9000 52501.332 0 0.066447829 0.065262287 0.062271789 0.19398191 0.14306596 + 10000 52545.628 0 0.073800792 0.072510553 0.070100713 0.21641206 0.14689578 + 11000 52416.561 0 0.081881868 0.080638809 0.078969817 0.24149049 0.15608324 + 12000 52271.578 0 0.090521937 0.088555992 0.08592156 0.26499949 0.1474981 +Loop time of 3.13506 on 4 procs for 12000 steps with 4096 atoms + +Performance: 3307.113 tau/day, 3827.677 timesteps/s +99.2% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0 | 0 | 0 | 0.0 | 0.00 +Neigh | 0.00060225 | 0.00060934 | 0.00061345 | 0.0 | 0.02 +Comm | 0.029197 | 0.029376 | 0.029582 | 0.1 | 0.94 +Output | 0.00060606 | 0.00087148 | 0.0016448 | 0.0 | 0.03 +Modify | 2.84 | 2.8773 | 2.8942 | 1.3 | 91.78 +Other | | 0.2269 | | | 7.24 + +Nlocal: 1024.00 ave 1037 max 999 min +Histogram: 1 0 0 0 0 0 0 1 1 1 +Nghost: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 +Neighs: 0.00000 ave 0 max 0 min +Histogram: 4 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 0 +Ave neighs/atom = 0.0000000 +Neighbor list builds = 73 +Dangerous builds = 0 + +# if you want to check that rotational diffusion is behaving as expected, +# uncomment next three lines for dump output and then plot , +# which should decay exponentially with timescale (d-1)*D_r (with d +# being simulation dimension) + +#dump 1 all custom 2000 dump_ideal_${params}_3d.lammpstrj id type # x y xu yu mux muy muz fx fy fz +#dump_modify 1 first yes sort id + +#run 120000 +Total wall time: 0:00:04 diff --git a/examples/USER/mdi/README b/examples/USER/mdi/README new file mode 100644 index 0000000000..086341f785 --- /dev/null +++ b/examples/USER/mdi/README @@ -0,0 +1,24 @@ +This dir contains scripts that demonstrate how to use LAMMPS as an +MDI engine. LAMMPS as an engine performs the MD timestepping. +The driver is a simple Python script. Every timestep the driver +sends one or more commands to LAMMPS. + +-------------- + +The Script.sh file has comands to perform some very simple example +runs. + +-------------- + +More complex calculations using LAMMPS as an MDI engine will +typically require the use of an MDI driver. Several MDI drivers +support calculations with LAMMPS, and include: + +Ab Initio Molecular Dynamics (AIMD) Driver: +https://github.com/MolSSI-MDI/MDI_AIMD_Driver + +Nudged Elastic Band (NEB) Driver: +https://github.com/MolSSI-MDI/MDI_NEB_Driver + +Metadynamics Driver: +https://github.com/MolSSI-MDI/MDI_Metadynamics diff --git a/examples/USER/mdi/Script.sh b/examples/USER/mdi/Script.sh new file mode 100644 index 0000000000..9e3f6e5f62 --- /dev/null +++ b/examples/USER/mdi/Script.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# sample launch scripts + + +# TCP, running LAMMPS on one proc + +python driver.py -mdi "-name driver -role DRIVER -method TCP -port 8021" & +../../../src/lmp_mdi -mdi "-name LAMMPS -role ENGINE -method TCP -port 8021 -hostname localhost" -in lammps.in > lammps.out & +wait + + +# TCP, running LAMMPS on two procs + +python driver.py -mdi "-name driver -role DRIVER -method TCP -port 8021" & +mpiexec -n 2 ../../../src/lmp_mdi -mdi "-name LAMMPS -role ENGINE -method TCP -port 8021 -hostname localhost" -in lammps.in > lammps.out & +wait diff --git a/examples/USER/mdi/driver.py b/examples/USER/mdi/driver.py new file mode 100644 index 0000000000..934f6f7be1 --- /dev/null +++ b/examples/USER/mdi/driver.py @@ -0,0 +1,24 @@ +import sys +import mdi + +use_mpi4py = False +try: + from mpi4py import MPI + use_mpi4py = True +except: + pass + +# Initialize the MDI Library +mdi.MDI_Init(sys.argv[2]) + +# Connect to the engine +comm = mdi.MDI_Accept_communicator() + +# Determine the name of the engine +mdi.MDI_Send_Command(" +namespace LAMMPS_AL { +#define LJSMOOTHT LJSMOOTH + +extern Device device; + +template +LJSMOOTHT::LJSMOOTH() : BaseAtomic(), _allocated(false) { +} + +template +LJSMOOTHT::~LJSMOOTH() { + clear(); +} + +template +int LJSMOOTHT::bytes_per_atom(const int max_nbors) const { + return this->bytes_per_atom_atomic(max_nbors); +} + +template +int LJSMOOTHT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, + double **cut_inner, double **cut_inner_sq) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_smooth,"k_lj_smooth",onetype); + if (success!=0) + return success; + + // If atom type constants fit in shared memory use fast kernel + int lj_types=ntypes; + shared_types=false; + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; + shared_types=true; + } + _lj_types=lj_types; + + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(lj_types*lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; iucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq, cut_inner_sq); + + lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + + ljsw.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,lj_types,ljsw,host_write,host_ljsw1,host_ljsw2, + host_ljsw3,host_ljsw4); + + ljsw0.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,lj_types,ljsw0,host_write,host_ljsw0,cut_inner); + + UCL_H_Vec dview; + sp_lj.alloc(4,*(this->ucl_device),UCL_READ_ONLY); + dview.view(host_special_lj,4,*(this->ucl_device)); + ucl_copy(sp_lj,dview,false); + + _allocated=true; + this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+ljsw.row_bytes()+ljsw0.row_bytes()+sp_lj.row_bytes(); + return 0; +} + +template +void LJSMOOTHT::reinit(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, + double **cut_inner, double **cut_inner_sq) { + // Allocate a host write buffer for data initialization + UCL_H_Vec host_write(_lj_types*_lj_types*32,*(this->ucl_device), + UCL_WRITE_ONLY); + + for (int i=0; i<_lj_types*_lj_types; i++) + host_write[i]=0.0; + + this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2, + host_cutsq,cut_inner_sq); + this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4, + host_offset); + this->atom->type_pack4(ntypes,_lj_types,ljsw,host_write,host_ljsw1,host_ljsw2, + host_ljsw3,host_ljsw4); + this->atom->type_pack2(ntypes,_lj_types,ljsw0,host_write,host_ljsw0,cut_inner); +} + +template +void LJSMOOTHT::clear() { + if (!_allocated) + return; + _allocated=false; + + lj1.clear(); + lj3.clear(); + ljsw.clear(); + ljsw0.clear(); + sp_lj.clear(); + this->clear_atomic(); +} + +template +double LJSMOOTHT::host_memory_usage() const { + return this->host_memory_usage_atomic()+sizeof(LJSMOOTH); +} + +// --------------------------------------------------------------------------- +// Calculate energies, forces, and torques +// --------------------------------------------------------------------------- +template +int LJSMOOTHT::loop(const int eflag, const int vflag) { + // Compute the block size and grid size to keep all cores busy + const int BX=this->block_size(); + + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &ljsw, &ljsw0, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &lj1, &lj3, &ljsw, &ljsw0, &_lj_types, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom); + } + this->time_pair.stop(); + return GX; +} + +template class LJSMOOTH; +} diff --git a/lib/gpu/lal_lj_smooth.cu b/lib/gpu/lal_lj_smooth.cu new file mode 100644 index 0000000000..d4a99ed3a7 --- /dev/null +++ b/lib/gpu/lal_lj_smooth.cu @@ -0,0 +1,259 @@ +// ************************************************************************** +// lj_smooth.cu +// ------------------- +// Gurgen Melikyan (HSE University) +// +// Device code for acceleration of the lj/smooth pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : gkmeliyan@edu.hse.ru +// *************************************************************************** + +#if defined(NV_KERNEL) || defined(USE_HIP) +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +_texture( pos_tex,float4); +#else +_texture_2d( pos_tex,int4); +#endif +#else +#define pos_tex x_ +#endif + +__kernel void k_lj_smooth(const __global numtyp4 *restrict x_, + const __global numtyp4 *restrict lj1, + const __global numtyp4 *restrict lj3, + const __global numtyp4 *restrict ljsw, + const __global numtyp2 *restrict ljsw0, + const int lj_types, + const __global numtyp *restrict sp_lj, + const __global int * dev_nbor, + const __global int * dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, const int inum, + const int nbor_pitch, const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_pair(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii +class LJSMOOTH : public BaseAtomic { + public: + LJSMOOTH(); + ~LJSMOOTH(); + + /// Clear any previous data and set up for a new LAMMPS run + /** \param max_nbors initial number of rows in the neighbor matrix + * \param cell_size cutoff + skin + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successful + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, + double **host_ljsw3, double **host_ljsw4, + double **cut_inner, double **cut_inner_sq); + + /// Send updated coeffs from host to device (to be compatible with fix adapt) + void reinit(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, + double **host_ljsw3, double **host_ljsw4, + double **cut_inner, double **cut_inner_sq); + + /// Clear all host and device data + /** \note This is called at the beginning of the init() routine **/ + void clear(); + + /// Returns memory usage on device per atom + int bytes_per_atom(const int max_nbors) const; + + /// Total host memory used by library for pair style + double host_memory_usage() const; + + // --------------------------- TYPE DATA -------------------------- + + /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq, lj1.w = cut_inner_sq + UCL_D_Vec lj1; + /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset + UCL_D_Vec lj3; + /// ljsw.x = ljsw1, ljsw.y = ljsw2, ljsw.z = ljsw3, ljsw.w = ljsw4 + UCL_D_Vec ljsw; + /// ljsw0.x = ljsw0 ljsw0.y = cut_inner + UCL_D_Vec ljsw0; + /// Special LJ values + UCL_D_Vec sp_lj; + + /// If atom type constants fit in shared memory, use fast kernels + bool shared_types; + + /// Number of atom types + int _lj_types; + + private: + bool _allocated; + int loop(const int _eflag, const int _vflag); +}; + +} + +#endif diff --git a/lib/gpu/lal_lj_smooth_ext.cpp b/lib/gpu/lal_lj_smooth_ext.cpp new file mode 100644 index 0000000000..48dad74071 --- /dev/null +++ b/lib/gpu/lal_lj_smooth_ext.cpp @@ -0,0 +1,144 @@ +/*************************************************************************** + lj_smooth_ext.cpp + ------------------- + Gurgen Melikyan (HSE University) + + Functions for LAMMPS access to lj/smooth acceleration routines. + + __________________________________________________________________________ + This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) + __________________________________________________________________________ + + begin : + email : gkmeliyan@edu.hse.ru + ***************************************************************************/ + +#include +#include +#include + +#include "lal_lj_smooth.h" + +using namespace std; +using namespace LAMMPS_AL; + +static LJSMOOTH LJSMTMF; + +// --------------------------------------------------------------------------- +// Allocate memory on host and device and copy constants to device +// --------------------------------------------------------------------------- +int ljsmt_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, double **cut_inner, double **cut_inner_sq) { + LJSMTMF.clear(); + gpu_mode=LJSMTMF.device->gpu_mode(); + double gpu_split=LJSMTMF.device->particle_split(); + int first_gpu=LJSMTMF.device->first_device(); + int last_gpu=LJSMTMF.device->last_device(); + int world_me=LJSMTMF.device->world_me(); + int gpu_rank=LJSMTMF.device->gpu_rank(); + int procs_per_gpu=LJSMTMF.device->procs_per_gpu(); + + LJSMTMF.device->init_message(screen,"lj/smooth",first_gpu,last_gpu); + + bool message=false; + if (LJSMTMF.device->replica_me()==0 && screen) + message=true; + + if (message) { + fprintf(screen,"Initializing Device and compiling on process 0..."); + fflush(screen); + } + + int init_ok=0; + if (world_me==0) + init_ok=LJSMTMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, max_nbors, + maxspecial, cell_size, gpu_split, screen, + host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); + + LJSMTMF.device->world_barrier(); + if (message) + fprintf(screen,"Done.\n"); + + for (int i=0; igpu_barrier(); + if (message) + fprintf(screen,"Done.\n"); + } + if (message) + fprintf(screen,"\n"); + + if (init_ok==0) + LJSMTMF.estimate_gpu_overhead(); + return init_ok; +} + +// --------------------------------------------------------------------------- +// Copy updated coeffs from host to device +// --------------------------------------------------------------------------- +void ljsmt_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **host_ljsw0, double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, + double **host_ljsw4, double **cut_inner, double **cut_inner_sq) { + int world_me=LJSMTMF.device->world_me(); + int gpu_rank=LJSMTMF.device->gpu_rank(); + int procs_per_gpu=LJSMTMF.device->procs_per_gpu(); + + if (world_me==0) + LJSMTMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, host_ljsw0, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, cut_inner, cut_inner_sq); + LJSMTMF.device->world_barrier(); + + for (int i=0; igpu_barrier(); + } +} + +void ljsmt_gpu_clear() { + LJSMTMF.clear(); +} + +int ** ljsmt_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return LJSMTMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); +} + +void ljsmt_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + LJSMTMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, + firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +} + +double ljsmt_gpu_bytes() { + return LJSMTMF.host_memory_usage(); +} diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index c759181aa2..3ce38c37d8 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,168 @@ # Change Log +## [3.4.00](https://github.com/kokkos/kokkos/tree/3.4.00) (2021-04-25) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.01...3.4.00) + +**Highlights:** +- SYCL Backend Almost Feature Complete +- OpenMPTarget Backend Almost Feature Complete +- Performance Improvements for HIP backend +- Require CMake 3.16 or newer +- Tool Callback Interface Enhancements +- cmath wrapper functions available now in Kokkos::Experimental + +**Features:** +- Implement parallel_scan with ThreadVectorRange and Reducer [\#3861](https://github.com/kokkos/kokkos/pull/3861) +- Implement SYCL Random [\#3849](https://github.com/kokkos/kokkos/pull/3849) +- OpenMPTarget: Adding Implementation for nested reducers [\#3845](https://github.com/kokkos/kokkos/pull/3845) +- Implement UniqueToken for SYCL [\#3833](https://github.com/kokkos/kokkos/pull/3833) +- OpenMPTarget: UniqueToken::Global implementation [\#3823](https://github.com/kokkos/kokkos/pull/3823) +- DualView sync's on ExecutionSpaces [\#3822](https://github.com/kokkos/kokkos/pull/3822) +- SYCL outer TeamPolicy parallel_reduce [\#3818](https://github.com/kokkos/kokkos/pull/3818) +- SYCL TeamPolicy::team_scan [\#3815](https://github.com/kokkos/kokkos/pull/3815) +- SYCL MDRangePolicy parallel_reduce [\#3801](https://github.com/kokkos/kokkos/pull/3801) +- Enable use of execution space instances in ScatterView [\#3786](https://github.com/kokkos/kokkos/pull/3786) +- SYCL TeamPolicy nested parallel_reduce [\#3783](https://github.com/kokkos/kokkos/pull/3783) +- OpenMPTarget: MDRange with TagType for parallel_for [\#3781](https://github.com/kokkos/kokkos/pull/3781) +- Adding OpenMPTarget parallel_scan [\#3655](https://github.com/kokkos/kokkos/pull/3655) +- SYCL basic TeamPolicy [\#3654](https://github.com/kokkos/kokkos/pull/3654) +- OpenMPTarget: scratch memory implementation [\#3611](https://github.com/kokkos/kokkos/pull/3611) + +**Implemented enhancements Backends and Archs:** +- SYCL choose a specific GPU [\#3918](https://github.com/kokkos/kokkos/pull/3918) +- [HIP] Lock access to scratch memory when using Teams [\#3916](https://github.com/kokkos/kokkos/pull/3916) +- [HIP] fix multithreaded access to get_next_driver [\#3908](https://github.com/kokkos/kokkos/pull/3908) +- Forward declare HIPHostPinnedSpace and SYCLSharedUSMSpace [\#3902](https://github.com/kokkos/kokkos/pull/3902) +- Let SYCL USMObjectMem use SharedAllocationRecord [\#3898](https://github.com/kokkos/kokkos/pull/3898) +- Implement clock_tic for SYCL [\#3893](https://github.com/kokkos/kokkos/pull/3893) +- Don't use a static variable in HIPInternal::scratch_space [\#3866](https://github.com/kokkos/kokkos/pull/3866)(https://github.com/kokkos/kokkos/pull/3866) +- Reuse memory for SYCL parallel_reduce [\#3873](https://github.com/kokkos/kokkos/pull/3873) +- Update SYCL compiler in CI [\#3826](https://github.com/kokkos/kokkos/pull/3826) +- Introduce HostSharedPtr to manage m_space_instance for Cuda/HIP/SYCL [\#3824](https://github.com/kokkos/kokkos/pull/3824) +- [HIP] Use shuffle for range reduction [\#3811](https://github.com/kokkos/kokkos/pull/3811) +- OpenMPTarget: Changes to the hierarchical parallelism [\#3808](https://github.com/kokkos/kokkos/pull/3808) +- Remove ExtendedReferenceWrapper for SYCL parallel_reduce [\#3802](https://github.com/kokkos/kokkos/pull/3802) +- Eliminate sycl_indirect_launch [\#3777](https://github.com/kokkos/kokkos/pull/3777) +- OpenMPTarget: scratch implementation for parallel_reduce [\#3776](https://github.com/kokkos/kokkos/pull/3776) +- Allow initializing SYCL execution space from sycl::queue and SYCL::impl_static_fence [\#3767](https://github.com/kokkos/kokkos/pull/3767) +- SYCL TeamPolicy scratch memory alternative [\#3763](https://github.com/kokkos/kokkos/pull/3763) +- Alternative implementation for SYCL TeamPolicy [\#3759](https://github.com/kokkos/kokkos/pull/3759) +- Unify handling of synchronous errors in SYCL [\#3754](https://github.com/kokkos/kokkos/pull/3754) +- core/Cuda: Half_t updates for cgsolve [\#3746](https://github.com/kokkos/kokkos/pull/3746) +- Unify HIPParallelLaunch structures [\#3733](https://github.com/kokkos/kokkos/pull/3733) +- Improve performance for SYCL parallel_reduce [\#3732](https://github.com/kokkos/kokkos/pull/3732) +- Use consistent types in Kokkos_OpenMPTarget_Parallel.hpp [\#3703](https://github.com/kokkos/kokkos/pull/3703) +- Implement non-blocking kernel launches for HIP backend [\#3697](https://github.com/kokkos/kokkos/pull/3697) +- Change SYCLInternal::m_queue std::unique_ptr -> std::optional [\#3677](https://github.com/kokkos/kokkos/pull/3677) +- Use alternative SYCL parallel_reduce implementation [\#3671](https://github.com/kokkos/kokkos/pull/3671) +- Use runtime values in KokkosExp_MDRangePolicy.hpp [\#3626](https://github.com/kokkos/kokkos/pull/3626) +- Clean up AnalyzePolicy [\#3564](https://github.com/kokkos/kokkos/pull/3564) +- Changes for indirect launch of SYCL parallel reduce [\#3511](https://github.com/kokkos/kokkos/pull/3511) + +**Implemented enhancements BuildSystem:** +- Also require C++14 when building gtest [\#3912](https://github.com/kokkos/kokkos/pull/3912) +- Fix compiling SYCL with OpenMP [\#3874](https://github.com/kokkos/kokkos/pull/3874) +- Require C++17 for SYCL (at configuration time) [\#3869](https://github.com/kokkos/kokkos/pull/3869) +- Add COMPILE_DEFINITIONS argument to kokkos_create_imported_tpl [\#3862](https://github.com/kokkos/kokkos/pull/3862) +- Do not pass arch flags to the linker with no rdc [\#3846](https://github.com/kokkos/kokkos/pull/3846) +- Try compiling C++14 check with C++14 support and print error message [\#3843](https://github.com/kokkos/kokkos/pull/3843) +- Enable HIP with Cray Clang [\#3842](https://github.com/kokkos/kokkos/pull/3842) +- Add an option to disable header self containment tests [\#3834](https://github.com/kokkos/kokkos/pull/3834) +- CMake check for C++14 [\#3809](https://github.com/kokkos/kokkos/pull/3809) +- Prefer -std=* over --std=* [\#3779](https://github.com/kokkos/kokkos/pull/3779) +- Kokkos launch compiler updates [\#3778](https://github.com/kokkos/kokkos/pull/3778) +- Updated comments and enabled no-op for kokkos_launch_compiler [\#3774](https://github.com/kokkos/kokkos/pull/3774) +- Apple's Clang not correctly recognised [\#3772](https://github.com/kokkos/kokkos/pull/3772) +- kokkos_launch_compiler + CUDA auto-detect arch [\#3770](https://github.com/kokkos/kokkos/pull/3770) +- Add Spack test support for Kokkos [\#3753](https://github.com/kokkos/kokkos/pull/3753) +- Split SYCL tests for aot compilation [\#3741](https://github.com/kokkos/kokkos/pull/3741) +- Use consistent OpenMP flag for IntelClang [\#3735](https://github.com/kokkos/kokkos/pull/3735) +- Add support for -Wno-deprecated-gpu-targets [\#3722](https://github.com/kokkos/kokkos/pull/3722) +- Add configuration to target CUDA compute capability 8.6 [\#3713](https://github.com/kokkos/kokkos/pull/3713) +- Added VERSION and SOVERSION to KOKKOS_INTERNAL_ADD_LIBRARY [\#3706](https://github.com/kokkos/kokkos/pull/3706) +- Add fast-math to known NVCC flags [\#3699](https://github.com/kokkos/kokkos/pull/3699) +- Add MI-100 arch string [\#3698](https://github.com/kokkos/kokkos/pull/3698) +- Require CMake >=3.16 [\#3679](https://github.com/kokkos/kokkos/pull/3679) +- KokkosCI.cmake, KokkosCTest.cmake.in, CTestConfig.cmake.in + CI updates [\#2844](https://github.com/kokkos/kokkos/pull/2844) + +**Implemented enhancements Tools:** +- Improve readability of the callback invocation in profiling [\#3860](https://github.com/kokkos/kokkos/pull/3860) +- V1.1 Tools Interface: incremental, action-based [\#3812](https://github.com/kokkos/kokkos/pull/3812) +- Enable launch latency simulations [\#3721](https://github.com/kokkos/kokkos/pull/3721) +- Added metadata callback to tools interface [\#3711](https://github.com/kokkos/kokkos/pull/3711) +- MDRange Tile Size Tuning [\#3688](https://github.com/kokkos/kokkos/pull/3688) +- Added support for command-line args for kokkos-tools [\#3627](https://github.com/kokkos/kokkos/pull/3627) +- Query max tile sizes for an MDRangePolicy, and set tile sizes on an existing policy [\#3481](https://github.com/kokkos/kokkos/pull/3481) + +**Implemented enhancements Other:** +- Try detecting ndevices in get_gpu [\#3921](https://github.com/kokkos/kokkos/pull/3921) +- Use strcmp to compare names() [\#3909](https://github.com/kokkos/kokkos/pull/3909) +- Add execution space arguments for constructor overloads that might allocate a new underlying View [\#3904](https://github.com/kokkos/kokkos/pull/3904) +- Prefix labels in internal use of kokkos_malloc [\#3891](https://github.com/kokkos/kokkos/pull/3891) +- Prefix labels for internal uses of SharedAllocationRecord [\#3890](https://github.com/kokkos/kokkos/pull/3890) +- Add missing hypot math function [\#3880](https://github.com/kokkos/kokkos/pull/3880) +- Unify algorithm unit tests to avoid code duplication [\#3851](https://github.com/kokkos/kokkos/pull/3851) +- DualView.template view() better matches for Devices in UVMSpace cases [\#3857](https://github.com/kokkos/kokkos/pull/3857) +- More extensive disentangling of Policy Traits [\#3829](https://github.com/kokkos/kokkos/pull/3829) +- Replaced nanosleep and sched_yield with STL routines [\#3825](https://github.com/kokkos/kokkos/pull/3825) +- Constructing Atomic Subviews [\#3810](https://github.com/kokkos/kokkos/pull/3810) +- Metadata Declaration in Core [\#3729](https://github.com/kokkos/kokkos/pull/3729) +- Allow using tagged final functor in parallel_reduce [\#3714](https://github.com/kokkos/kokkos/pull/3714) +- Major duplicate code removal in SharedAllocationRecord specializations [\#3658](https://github.com/kokkos/kokkos/pull/3658) + +**Fixed bugs:** +- Provide forward declarations in Kokkos_ViewLayoutTiled.hpp for XL [\#3911](https://github.com/kokkos/kokkos/pull/3911) +- Fixup absolute value of floating points in Kokkos complex [\#3882](https://github.com/kokkos/kokkos/pull/3882) +- Address intel 17 ICE [\#3881](https://github.com/kokkos/kokkos/pull/3881) +- Add missing pow(Kokkos::complex) overloads [\#3868](https://github.com/kokkos/kokkos/pull/3868) +- Fix bug {pow, log}(Kokkos::complex) [\#3866](https://github.com/kokkos/kokkos/pull/3866)(https://github.com/kokkos/kokkos/pull/3866) +- Cleanup writing to output streams in Cuda [\#3859](https://github.com/kokkos/kokkos/pull/3859) +- Fixup cache CUDA fallback execution space instance used by DualView::sync [\#3856](https://github.com/kokkos/kokkos/pull/3856) +- Fix cmake warning with pthread [\#3854](https://github.com/kokkos/kokkos/pull/3854) +- Fix typo FOUND_CUDA_{DRIVVER -> DRIVER} [\#3852](https://github.com/kokkos/kokkos/pull/3852) +- Fix bug in SYCL team_reduce [\#3848](https://github.com/kokkos/kokkos/pull/3848) +- Atrocious bug in MDRange tuning [\#3803](https://github.com/kokkos/kokkos/pull/3803) +- Fix compiling SYCL with Kokkos_ENABLE_TUNING=ON [\#3800](https://github.com/kokkos/kokkos/pull/3800) +- Fixed command line parsing bug [\#3797](https://github.com/kokkos/kokkos/pull/3797) +- Workaround race condition in SYCL parallel_reduce [\#3782](https://github.com/kokkos/kokkos/pull/3782) +- Fix Atomic{Min,Max} for Kepler30 [\#3780](https://github.com/kokkos/kokkos/pull/3780) +- Fix SYCL typo [\#3755](https://github.com/kokkos/kokkos/pull/3755) +- Fixed Kokkos_install_additional_files macro [\#3752](https://github.com/kokkos/kokkos/pull/3752) +- Fix a typo for Kokkos_ARCH_A64FX [\#3751](https://github.com/kokkos/kokkos/pull/3751) +- OpenMPTarget: fixes and workarounds to work with "Release" build type [\#3748](https://github.com/kokkos/kokkos/pull/3748) +- Fix parsing bug for number of devices command line argument [\#3724](https://github.com/kokkos/kokkos/pull/3724) +- Avoid more warnings with clang and C++20 [\#3719](https://github.com/kokkos/kokkos/pull/3719) +- Fix gcc-10.1 C++20 warnings [\#3718](https://github.com/kokkos/kokkos/pull/3718) +- Fix cuda cache config not being set correct [\#3712](https://github.com/kokkos/kokkos/pull/3712) +- Fix dualview deepcopy perftools [\#3701](https://github.com/kokkos/kokkos/pull/3701) +- use drand instead of frand in drand [\#3696](https://github.com/kokkos/kokkos/pull/3696) + +**Incompatibilities:** +- Remove unimplemented member functions of SYCLDevice [\#3919](https://github.com/kokkos/kokkos/pull/3919) +- Replace cl::sycl [\#3896](https://github.com/kokkos/kokkos/pull/3896) +- Get rid of SYCL workaround in Kokkos_Complex.hpp [\#3884](https://github.com/kokkos/kokkos/pull/3884) +- Replace most uses of if_c [\#3883](https://github.com/kokkos/kokkos/pull/3883) +- Remove Impl::enable_if_type [\#3863](https://github.com/kokkos/kokkos/pull/3863) +- Remove HostBarrier test [\#3847](https://github.com/kokkos/kokkos/pull/3847) +- Avoid (void) interface [\#3836](https://github.com/kokkos/kokkos/pull/3836) +- Remove VerifyExecutionCanAccessMemorySpace [\#3813](https://github.com/kokkos/kokkos/pull/3813) +- Avoid duplicated code in ScratchMemorySpace [\#3793](https://github.com/kokkos/kokkos/pull/3793) +- Remove superfluous FunctorFinal specialization [\#3788](https://github.com/kokkos/kokkos/pull/3788) +- Rename cl::sycl -> sycl in Kokkos_MathematicalFunctions.hpp [\#3678](https://github.com/kokkos/kokkos/pull/3678) +- Remove integer_sequence backward compatibility implementation [\#3533](https://github.com/kokkos/kokkos/pull/3533) + +**Enabled tests:** +- Fixup re-enable core performance tests [\#3903](https://github.com/kokkos/kokkos/pull/3903) +- Enable more SYCL tests [\#3900](https://github.com/kokkos/kokkos/pull/3900) +- Restrict MDRange Policy tests for Intel GPUs [\#3853](https://github.com/kokkos/kokkos/pull/3853) +- Disable death tests for rawhide [\#3844](https://github.com/kokkos/kokkos/pull/3844) +- OpenMPTarget: Block unit tests that do not pass with the nvidia compiler [\#3839](https://github.com/kokkos/kokkos/pull/3839) +- Enable Bitset container test for SYCL [\#3830](https://github.com/kokkos/kokkos/pull/3830) +- Enable some more SYCL tests [\#3744](https://github.com/kokkos/kokkos/pull/3744) +- Enable SYCL atomic tests [\#3742](https://github.com/kokkos/kokkos/pull/3742) +- Enable more SYCL perf_tests [\#3692](https://github.com/kokkos/kokkos/pull/3692) +- Enable examples for SYCL [\#3691](https://github.com/kokkos/kokkos/pull/3691) + ## [3.3.01](https://github.com/kokkos/kokkos/tree/3.3.01) (2021-01-06) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.00...3.3.01) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 7bc3c77256..6fc1bf7d2f 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -72,7 +72,7 @@ ENDFUNCTION() LIST(APPEND CMAKE_MODULE_PATH cmake/Modules) IF(NOT KOKKOS_HAS_TRILINOS) - cmake_minimum_required(VERSION 3.10 FATAL_ERROR) + cmake_minimum_required(VERSION 3.16 FATAL_ERROR) set(CMAKE_DISABLE_SOURCE_CHANGES ON) set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) IF (Spack_WORKAROUND) @@ -111,27 +111,25 @@ ENDIF() set(Kokkos_VERSION_MAJOR 3) -set(Kokkos_VERSION_MINOR 3) -set(Kokkos_VERSION_PATCH 1) +set(Kokkos_VERSION_MINOR 4) +set(Kokkos_VERSION_PATCH 00) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") -IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") - CMAKE_POLICY(SET CMP0074 NEW) -ENDIF() +MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") +CMAKE_POLICY(SET CMP0074 NEW) # Load either the real TriBITS or a TriBITS wrapper # for certain utility functions that are universal (like GLOBAL_SET) INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) -IF (Kokkos_ENABLE_CUDA AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14.0") - #If we are building CUDA, we have tricked CMake because we declare a CXX project - #If the default C++ standard for a given compiler matches the requested - #standard, then CMake just omits the -std flag in later versions of CMake - #This breaks CUDA compilation (CUDA compiler can have a different default - #-std then the underlying host compiler by itself). Setting this variable - #forces CMake to always add the -std flag even if it thinks it doesn't need it +IF (Kokkos_ENABLE_CUDA) + # If we are building CUDA, we have tricked CMake because we declare a CXX project + # If the default C++ standard for a given compiler matches the requested + # standard, then CMake just omits the -std flag in later versions of CMake + # This breaks CUDA compilation (CUDA compiler can have a different default + # -std then the underlying host compiler by itself). Setting this variable + # forces CMake to always add the -std flag even if it thinks it doesn't need it GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98) ENDIF() @@ -139,15 +137,19 @@ ENDIF() # I really wish these were regular variables # but scoping issues can make it difficult GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) -GLOBAL_SET(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +GLOBAL_SET(KOKKOS_LINK_OPTIONS) GLOBAL_SET(KOKKOS_CUDA_OPTIONS) GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) # We need to append text here for making sure TPLs # we import are available for an installed Kokkos GLOBAL_SET(KOKKOS_TPL_EXPORTS) -# this could probably be scoped to project +# KOKKOS_DEPENDENCE is used by kokkos_launch_compiler GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) +# MSVC never goes through kokkos_launch_compiler +IF(NOT MSVC) + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +ENDIF() # Include a set of Kokkos-specific wrapper functions that # will either call raw CMake or TriBITS diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 061b7a46ee..aa97f99b75 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -11,8 +11,8 @@ CXXFLAGS += $(SHFLAGS) endif KOKKOS_VERSION_MAJOR = 3 -KOKKOS_VERSION_MINOR = 3 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 4 +KOKKOS_VERSION_PATCH = 00 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,OpenMP,Pthread,Serial @@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP" #KOKKOS_DEVICES ?= "Pthread" # Options: # Intel: KNC,KNL,SNB,HSW,BDW,SKX -# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80 +# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: BGQ,Power7,Power8,Power9 # AMD-GPUS: Vega900,Vega906,Vega908 @@ -164,17 +164,17 @@ KOKKOS_INTERNAL_OS_DARWIN := $(call kokkos_has_string,$(KOKKOS_OS),Darwin) KOKKOS_CXX_VERSION := $(strip $(shell $(CXX) --version 2>&1)) KOKKOS_INTERNAL_COMPILER_INTEL := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Intel Corporation) KOKKOS_INTERNAL_COMPILER_PGI := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),PGI) -KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)) -KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)) -KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep nvcc | wc -l)>0" | bc)) +KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep -c XL)) +KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-")) +KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) -KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple LLVM) +KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) KOKKOS_INTERNAL_COMPILER_GCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC) # Check Host Compiler if using NVCC through nvcc_wrapper ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) - KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep nvcc_wrapper | wc -l)) + KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep -c nvcc_wrapper)) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER), 1) KOKKOS_CXX_HOST_VERSION := $(strip $(shell $(CXX) $(CXXFLAGS) --host-version 2>&1)) @@ -297,11 +297,11 @@ else #KOKKOS_INTERNAL_CXX1Z_FLAG := -hstd=c++1z #KOKKOS_INTERNAL_CXX2A_FLAG := -hstd=c++2a else - KOKKOS_INTERNAL_CXX14_FLAG := --std=c++14 - KOKKOS_INTERNAL_CXX1Y_FLAG := --std=c++1y - KOKKOS_INTERNAL_CXX17_FLAG := --std=c++17 - KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z - KOKKOS_INTERNAL_CXX2A_FLAG := --std=c++2a + KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14 + KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y + KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17 + KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1z + KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a endif endif endif @@ -332,6 +332,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA70 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72) KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) +KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere86) KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ @@ -344,7 +345,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \ + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ - + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80)) + + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86)) #SEK: This seems like a bug to me ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) @@ -585,10 +587,10 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT), 1) endif ifeq ($(KOKKOS_INTERNAL_ENABLE_TUNING), 1) - tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_TUNING") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TUNING") endif -tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_LIBDL") +tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LIBDL") ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) ifneq ($(KOKKOS_CMAKE), yes) @@ -752,6 +754,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1) KOKKOS_CXXFLAGS += -march=armv8.2-a+sve KOKKOS_LDFLAGS += -march=armv8.2-a+sve + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_CXXFLAGS += -msve-vector-bits=512 + KOKKOS_LDFLAGS += -msve-vector-bits=512 + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) + KOKKOS_CXXFLAGS += -msve-vector-bits=512 + KOKKOS_LDFLAGS += -msve-vector-bits=512 + endif endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1) @@ -1100,6 +1110,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80 endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 + endif ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) @@ -1159,7 +1174,7 @@ endif KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1) ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h) - KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l)) + KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep -c define)) else KOKKOS_INTERNAL_NEW_CONFIG := 1 endif @@ -1181,41 +1196,41 @@ tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_SetupBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) else endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_SetupBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) @@ -1334,7 +1349,7 @@ ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) endif # With Cygwin functions such as fdopen and fileno are not defined -# when strict ansi is enabled. strict ansi gets enabled with --std=c++14 +# when strict ansi is enabled. strict ansi gets enabled with -std=c++14 # though. So we hard undefine it here. Not sure if that has any bad side effects # This is needed for gtest actually, not for Kokkos itself! ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1) diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index 5a03f7d17e..cf9fc24242 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -36,6 +36,8 @@ Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp +Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 69d6cf8f35..904cf5ccb9 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -668,6 +668,25 @@ struct Random_UniqueIndex { }; #endif +#ifdef KOKKOS_ENABLE_SYCL +template <> +struct Random_UniqueIndex { + using locks_view_type = View; + KOKKOS_FUNCTION + static int get_state_idx(const locks_view_type& locks_) { +#ifdef KOKKOS_ARCH_INTEL_GEN + int i = Kokkos::Impl::clock_tic() % locks_.extent(0); +#else + int i = 0; +#endif + while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + i = (i + 1) % static_cast(locks_.extent(0)); + } + return i; + } +}; +#endif + } // namespace Impl template @@ -1028,7 +1047,7 @@ class Random_XorShift1024 { KOKKOS_INLINE_FUNCTION double drand(const double& start, const double& end) { - return frand(end - start) + start; + return drand(end - start) + start; } // Marsaglia polar method for drawing a standard normal distributed random diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index 819c9e54ba..9109837985 100644 --- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -3,6 +3,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest) @@ -25,7 +26,7 @@ KOKKOS_ADD_TEST_LIBRARY( TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC GTEST_HAS_TR1_TUPLE=0 GTEST_HAS_PTHREAD=0) IF((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) -TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_11) + TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_14) ENDIF() # Suppress clang-tidy diagnostics on code that we do not have control over @@ -33,51 +34,42 @@ IF(CMAKE_CXX_CLANG_TIDY) SET_TARGET_PROPERTIES(kokkosalgorithms_gtest PROPERTIES CXX_CLANG_TIDY "") ENDIF() -SET(SOURCES - UnitTestMain.cpp -) +SET(ALGORITHM UnitTestMain.cpp) IF(Kokkos_ENABLE_OPENMP) - LIST( APPEND SOURCES - TestOpenMP.cpp + LIST(APPEND ALGORITHM_SOURCES TestOpenMP_Sort1D.cpp TestOpenMP_Sort3D.cpp TestOpenMP_SortDynamicView.cpp - TestOpenMP_Random.cpp ) ENDIF() -IF(Kokkos_ENABLE_HIP) - LIST( APPEND SOURCES - TestHIP.cpp - ) -ENDIF() +foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL) + # Because there is always an exception to the rule + if(Tag STREQUAL "Threads") + set(DEVICE "PTHREAD") + else() + string(TOUPPER ${Tag} DEVICE) + endif() -IF(Kokkos_ENABLE_CUDA) - LIST( APPEND SOURCES - TestCuda.cpp - ) -ENDIF() - -IF(Kokkos_ENABLE_HPX) - LIST( APPEND SOURCES - TestHPX.cpp - ) -ENDIF() - -IF(Kokkos_ENABLE_SERIAL) - LIST( APPEND SOURCES - TestSerial.cpp - ) -ENDIF() - -IF(Kokkos_ENABLE_PTHREAD) - LIST( APPEND SOURCES - TestThreads.cpp - ) -ENDIF() + if(Kokkos_ENABLE_${DEVICE}) + set(dir ${CMAKE_CURRENT_BINARY_DIR}) + set(file ${dir}/Test${Tag}.cpp) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. + file(WRITE ${dir}/dummy.cpp + "#include \n" + "#include \n" + "#include \n" + ) + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND ALGORITHM_SOURCES ${file}) + endif() +endforeach() KOKKOS_ADD_EXECUTABLE_AND_TEST( UnitTest - SOURCES ${SOURCES} + SOURCES + UnitTestMain.cpp + ${ALGORITHM_SOURCES} ) diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index c112d7c6fc..dd0aa87de0 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ b/lib/kokkos/algorithms/unit_tests/Makefile @@ -20,11 +20,19 @@ override LDFLAGS += -lpthread include $(KOKKOS_PATH)/Makefile.kokkos -KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests -I${KOKKOS_PATH}/core/unit_test/category_files TEST_TARGETS = TARGETS = +tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ + $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\ + $(shell echo "\#include " > Test$(device).cpp); \ + $(shell echo "\#include " >> Test$(device).cpp); \ + $(shell echo "\#include " >> Test$(device).cpp); \ + ) \ +) + ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o TARGETS += KokkosAlgorithms_UnitTest_Cuda @@ -44,7 +52,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - OBJ_OPENMP = TestOpenMP.o TestOpenMP_Random.o TestOpenMP_Sort1D.o TestOpenMP_Sort3D.o TestOpenMP_SortDynamicView.o UnitTestMain.o gtest-all.o + OBJ_OPENMP = TestOpenMP.o TestOpenMP_Sort1D.o TestOpenMP_Sort3D.o TestOpenMP_SortDynamicView.o UnitTestMain.o gtest-all.o TARGETS += KokkosAlgorithms_UnitTest_OpenMP TEST_TARGETS += test-openmp endif diff --git a/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp b/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp index a9b2010ad0..4a5839f0c8 100644 --- a/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp @@ -59,6 +59,8 @@ TEST(openmp, SortUnsigned1D) { Impl::test_1D_sort(171); } +TEST(openmp, SortIssue1160) { Impl::test_issue_1160_sort(); } + } // namespace Test #else void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {} diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index caba92c152..1f14875096 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -491,6 +491,34 @@ void test_random(unsigned int num_draws) { } } // namespace Impl +template +void test_random_xorshift64() { +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_HIP) + const int num_draws = 132141141; +#else // SERIAL, HPX, OPENMP + const int num_draws = 10240000; +#endif + Impl::test_random>(num_draws); + Impl::test_random>>( + num_draws); +} + +template +void test_random_xorshift1024() { +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_HIP) + const int num_draws = 52428813; +#else // SERIAL, HPX, OPENMP + const int num_draws = 10130144; +#endif + Impl::test_random>( + num_draws); + Impl::test_random>>( + num_draws); +} } // namespace Test #endif // KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/lib/kokkos/algorithms/unit_tests/TestRandomCommon.hpp b/lib/kokkos/algorithms/unit_tests/TestRandomCommon.hpp new file mode 100644 index 0000000000..c6d3b59ae1 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestRandomCommon.hpp @@ -0,0 +1,60 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP + +#include + +namespace Test { + +TEST(TEST_CATEGORY, Random_XorShift64) { + test_random_xorshift64(); +} +TEST(TEST_CATEGORY, Random_XorShift1024_0) { + test_random_xorshift1024(); +} +} // namespace Test + +#endif diff --git a/lib/kokkos/containers/unit_tests/TestHIP_Category.hpp b/lib/kokkos/algorithms/unit_tests/TestSortCommon.hpp similarity index 88% rename from lib/kokkos/containers/unit_tests/TestHIP_Category.hpp rename to lib/kokkos/algorithms/unit_tests/TestSortCommon.hpp index c2d60d1814..56657b6574 100644 --- a/lib/kokkos/containers/unit_tests/TestHIP_Category.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSortCommon.hpp @@ -42,10 +42,14 @@ //@HEADER */ -#ifndef KOKKOS_TEST_HIP_HPP -#define KOKKOS_TEST_HIP_HPP +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP -#define TEST_CATEGORY hip -#define TEST_EXECSPACE Kokkos::Experimental::HIP +#include +namespace Test { +TEST(TEST_CATEGORY, SortUnsigned) { + Impl::test_sort(171); +} +} // namespace Test #endif diff --git a/lib/kokkos/appveyor.yml b/lib/kokkos/appveyor.yml index c40bf066b7..e8763c0b66 100644 --- a/lib/kokkos/appveyor.yml +++ b/lib/kokkos/appveyor.yml @@ -3,8 +3,4 @@ image: clone_folder: c:\projects\source build_script: - cmd: >- - mkdir build && - cd build && - cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON && - cmake --build . --target install && - ctest -C Debug -V + cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc /d1reportClassLayoutChanges" -DCTEST_ARGS="-C Debug -V --output-on-failure" -DBUILD_NAME=MSVC-2019 -DBUILD_TYPE=Debug -DSITE=AppVeyor -DTARGET=install -P cmake/KokkosCI.cmake diff --git a/lib/kokkos/bin/kokkos_launch_compiler b/lib/kokkos/bin/kokkos_launch_compiler index 1fbebf648f..d929d24f1d 100755 --- a/lib/kokkos/bin/kokkos_launch_compiler +++ b/lib/kokkos/bin/kokkos_launch_compiler @@ -13,6 +13,17 @@ # $1 are 'ar', 'cmake', etc. during the linking phase # +# emit a message about the underlying command executed +: ${DEBUG:=0} +: ${KOKKOS_DEBUG_LAUNCH_COMPILER:=${DEBUG}} + +debug-message() +{ + if [ "${KOKKOS_DEBUG_LAUNCH_COMPILER}" -ne 0 ]; then + echo -e "##### $(basename ${BASH_SOURCE[0]}) executing: \"$@\"... #####" + fi +} + # check the arguments for the KOKKOS_DEPENDENCE compiler definition KOKKOS_DEPENDENCE=0 for i in ${@} @@ -23,16 +34,30 @@ do fi done -# if C++ is not passed, someone is probably trying to invoke it directly +# if Kokkos compiler is not passed, someone is probably trying to invoke it directly if [ -z "${1}" ]; then - echo -e "\n${BASH_SOURCE[0]} was invoked without the C++ compiler as the first argument." + echo -e "\n${BASH_SOURCE[0]} was invoked without the Kokkos compiler as the first argument." echo "This script is not indended to be directly invoked by any mechanism other" - echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake\n" + echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake.\n" + exit 1 +fi + +# if Kokkos compiler is not passed, someone is probably trying to invoke it directly +if [ -z "${2}" ]; then + echo -e "\n${BASH_SOURCE[0]} was invoked without the C++ compiler as the second argument." + echo "This script is not indended to be directly invoked by any mechanism other" + echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake.\n" exit 1 fi # if there aren't two args, this isn't necessarily invalid, just a bit strange -if [ -z "${2}" ]; then exit 0; fi +if [ -z "${3}" ]; then exit 0; fi + +# store the Kokkos compiler +KOKKOS_COMPILER=${1} + +# remove the Kokkos compiler from the arguments +shift # store the expected C++ compiler CXX_COMPILER=${1} @@ -40,48 +65,57 @@ CXX_COMPILER=${1} # remove the expected C++ compiler from the arguments shift -# after the above shift, $1 is now the exe for the compile or link command, e.g. -# kokkos_launch_compiler g++ gcc -c file.c -o file.o +# NOTE: in below, ${KOKKOS_COMPILER} is usually nvcc_wrapper +# +# after the above shifts, $1 is now the exe for the compile or link command, e.g. +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ gcc -c file.c -o file.o # becomes: # kokkos_launch_compiler gcc -c file.c -o file.o -# Check to see if the executable is the C++ compiler and if it is not, then +# We check to see if the executable is the C++ compiler and if it is not, then # just execute the command. # # Summary: -# kokkos_launch_compiler g++ gcc -c file.c -o file.o +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ gcc -c file.c -o file.o # results in this command being executed: # gcc -c file.c -o file.o # and -# kokkos_launch_compiler g++ g++ -c file.cpp -o file.o +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ g++ -c file.cpp -o file.o # results in this command being executed: -# nvcc_wrapper -c file.cpp -o file.o +# ${KOKKOS_COMPILER} -c file.cpp -o file.o if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then - # the command does not depend on Kokkos so just execute the command w/o re-directing to nvcc_wrapper + debug-message $@ + # the command does not depend on Kokkos so just execute the command w/o re-directing to ${KOKKOS_COMPILER} eval $@ else - # the executable is the C++ compiler, so we need to re-direct to nvcc_wrapper - - # find the nvcc_wrapper from the same build/install - NVCC_WRAPPER="$(dirname ${BASH_SOURCE[0]})/nvcc_wrapper" - - if [ -z "${NVCC_WRAPPER}" ]; then - echo -e "\nError: nvcc_wrapper not found in $(dirname ${BASH_SOURCE[0]}).\n" + # the executable is the C++ compiler, so we need to re-direct to ${KOKKOS_COMPILER} + if [ ! -f "${KOKKOS_COMPILER}" ]; then + echo -e "\nError: the compiler redirect for Kokkos was not found at ${KOKKOS_COMPILER}\n" exit 1 fi - # set default nvcc wrapper compiler if not specified - : ${NVCC_WRAPPER_DEFAULT_COMPILER:=${CXX_COMPILER}} - export NVCC_WRAPPER_DEFAULT_COMPILER + # find the nvcc_wrapper from the same build/install + NVCC_WRAPPER="$(dirname ${BASH_SOURCE[0]})/nvcc_wrapper" + if [ "${KOKKOS_COMPILER}" = "${NVCC_WRAPPER}" ]; then + # this should only be valid in the install tree -- it will be set to CMAKE_CXX_COMPILER used using Kokkos installation + if [ -z $(echo "@NVCC_WRAPPER_DEFAULT_COMPILER@" | grep 'NVCC_WRAPPER_DEFAULT_COMPILER') ]; then + : ${NVCC_WRAPPER_DEFAULT_COMPILER:="@NVCC_WRAPPER_DEFAULT_COMPILER@"} + fi - # calling itself will cause an infinitely long build - if [ "${NVCC_WRAPPER}" = "${NVCC_WRAPPER_DEFAULT_COMPILER}" ]; then - echo -e "\nError: NVCC_WRAPPER == NVCC_WRAPPER_DEFAULT_COMPILER. Terminating to avoid infinite loop!\n" - exit 1 + # set default nvcc wrapper compiler if not specified + : ${NVCC_WRAPPER_DEFAULT_COMPILER:=${CXX_COMPILER}} + export NVCC_WRAPPER_DEFAULT_COMPILER + + # nvcc_wrapper calling itself will cause an infinitely long build + if [ "${NVCC_WRAPPER}" = "${NVCC_WRAPPER_DEFAULT_COMPILER}" ]; then + echo -e "\nError: NVCC_WRAPPER == NVCC_WRAPPER_DEFAULT_COMPILER. Terminating to avoid infinite loop!\n" + exit 1 + fi fi # discard the compiler from the command shift - # execute nvcc_wrapper - ${NVCC_WRAPPER} $@ + debug-message ${KOKKOS_COMPILER} $@ + # execute ${KOKKOS_COMPILER} (again, usually nvcc_wrapper) + ${KOKKOS_COMPILER} $@ fi diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 4ecf4c66d5..5556e888e3 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -191,11 +191,11 @@ do shift ;; #Handle known nvcc args - --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args - --expt-extended-lambda|--expt-relaxed-constexpr) + --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets) cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument diff --git a/lib/kokkos/cmake/CTestConfig.cmake.in b/lib/kokkos/cmake/CTestConfig.cmake.in new file mode 100644 index 0000000000..1f82c0d64d --- /dev/null +++ b/lib/kokkos/cmake/CTestConfig.cmake.in @@ -0,0 +1,91 @@ +#----------------------------------------------------------------------------------------# +# +# CTestConfig.cmake template for Kokkos +# +#----------------------------------------------------------------------------------------# + +# +# dash-board related +# +set(CTEST_PROJECT_NAME "Kokkos") +set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC") +set(CTEST_DROP_METHOD "https") +set(CTEST_DROP_SITE "cdash.nersc.gov") +set(CTEST_DROP_LOCATION "/submit.php?project=${CTEST_PROJECT_NAME}") +set(CTEST_CDASH_VERSION "1.6") +set(CTEST_CDASH_QUERY_VERSION TRUE) +set(CTEST_SUBMIT_RETRY_COUNT "1") +set(CTEST_SUBMIT_RETRY_DELAY "30") + +# +# configure/build related +# +set(CTEST_BUILD_NAME "@BUILD_NAME@") +set(CTEST_MODEL "@MODEL@") +set(CTEST_SITE "@SITE@") +set(CTEST_CONFIGURATION_TYPE "@BUILD_TYPE@") +set(CTEST_SOURCE_DIRECTORY "@SOURCE_REALDIR@") +set(CTEST_BINARY_DIRECTORY "@BINARY_REALDIR@") + +# +# configure/build related +# +set(CTEST_UPDATE_TYPE "git") +set(CTEST_UPDATE_VERSION_ONLY ON) +# set(CTEST_GENERATOR "") +# set(CTEST_GENERATOR_PLATFORM "") + +# +# testing related +# +set(CTEST_TIMEOUT "7200") +set(CTEST_TEST_TIMEOUT "7200") +set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100") +set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100") +set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "1048576") + +# +# coverage related +# +set(CTEST_CUSTOM_COVERAGE_EXCLUDE ".*tpls/.*;/usr/.*;.*unit_test/.*;.*unit_tests/.*;.*perf_test/.*") + +# +# commands +# +if(NOT "@CHECKOUT_COMMAND@" STREQUAL "") + set(CTEST_CHECKOUT_COMMAND "@CHECKOUT_COMMAND@") +endif() +set(CTEST_UPDATE_COMMAND "@GIT_EXECUTABLE@") +set(CTEST_CONFIGURE_COMMAND "@CMAKE_COMMAND@ -DCMAKE_BUILD_TYPE=@BUILD_TYPE@ -DKokkos_ENABLE_TESTS=ON @CONFIG_ARGS@ @SOURCE_REALDIR@") +set(CTEST_BUILD_COMMAND "@CMAKE_COMMAND@ --build @BINARY_REALDIR@ --target @TARGET@") +if(NOT WIN32) + set(CTEST_BUILD_COMMAND "${CTEST_BUILD_COMMAND} -- -j@BUILD_JOBS@") +endif() +set(CTEST_COVERAGE_COMMAND "gcov") +set(CTEST_MEMORYCHECK_COMMAND "valgrind") +set(CTEST_GIT_COMMAND "@GIT_EXECUTABLE@") + +# +# various configs +# +set(APPEND_VALUE @APPEND@) +if(APPEND_VALUE) + set(APPEND_CTEST APPEND) +endif() + +macro(SET_TEST_PROP VAR) + if(NOT "${ARGS}" STREQUAL "") + set(${VAR}_CTEST ${VAR} ${ARGN}) + endif() +endmacro() + +set_test_prop(START @START@) +set_test_prop(END @END@) +set_test_prop(STRIDE @STRIDE@) +set_test_prop(INCLUDE @INCLUDE@) +set_test_prop(EXCLUDE @EXCLUDE@) +set_test_prop(INCLUDE_LABEL @INCLUDE_LABEL@) +set_test_prop(EXCLUDE_LABEL @EXCLUDE_LABEL@) +set_test_prop(PARALLEL_LEVEL @PARALLEL_LEVEL@) +set_test_prop(STOP_TIME @STOP_TIME@) +set_test_prop(COVERAGE_LABELS @LABELS@) diff --git a/lib/kokkos/cmake/KokkosCI.cmake b/lib/kokkos/cmake/KokkosCI.cmake new file mode 100644 index 0000000000..e8c9af37ad --- /dev/null +++ b/lib/kokkos/cmake/KokkosCI.cmake @@ -0,0 +1,350 @@ +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +message(STATUS "") + +get_cmake_property(_cached_vars CACHE_VARIABLES) +set(KOKKOS_CMAKE_ARGS) +set(EXCLUDED_VARIABLES "CMAKE_COMMAND" "CMAKE_CPACK_COMMAND" "CMAKE_CTEST_COMMAND" "CMAKE_ROOT" + "CTEST_ARGS" "BUILD_NAME" "CMAKE_CXX_FLAGS" "CMAKE_BUILD_TYPE") +list(SORT _cached_vars) +foreach(_var ${_cached_vars}) + if(NOT "${_var}" IN_LIST EXCLUDED_VARIABLES) + list(APPEND KOKKOS_CMAKE_ARGS ${_var}) + if("${_var}" STREQUAL "CMAKE_BUILD_TYPE") + set(BUILD_TYPE "${CMAKE_BUILD_TYPE}") + endif() + endif() +endforeach() + + +#----------------------------------------------------------------------------------------# +# +# Macros and variables +# +#----------------------------------------------------------------------------------------# + +macro(CHECK_REQUIRED VAR) + if(NOT DEFINED ${VAR}) + message(FATAL_ERROR "Error! Variable '${VAR}' must be defined") + endif() +endmacro() + +# require the build name variable +CHECK_REQUIRED(BUILD_NAME) + +# uses all args +macro(SET_DEFAULT VAR) + if(NOT DEFINED ${VAR}) + set(${VAR} ${ARGN}) + endif() + # remove these ctest configuration variables from the defines + # passed to the Kokkos configuration + if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) + list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") + endif() +endmacro() + +# uses first arg -- useful for selecting via priority from multiple +# potentially defined variables, e.g.: +# +# set_default_arg1(BUILD_NAME ${TRAVIS_BUILD_NAME} ${BUILD_NAME}) +# +macro(SET_DEFAULT_ARG1 VAR) + if(NOT DEFINED ${VAR}) + foreach(_ARG ${ARGN}) + if(NOT "${_ARG}" STREQUAL "") + set(${VAR} ${_ARG}) + break() + endif() + endforeach() + endif() + # remove these ctest configuration variables from the defines + # passed to the Kokkos configuration + if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) + list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") + endif() +endmacro() + +# determine the default working directory +if(NOT "$ENV{WORKSPACE}" STREQUAL "") + set(WORKING_DIR "$ENV{WORKSPACE}") +else() + get_filename_component(WORKING_DIR ${CMAKE_CURRENT_LIST_DIR} DIRECTORY) +endif() + +# determine the hostname +execute_process(COMMAND hostname + OUTPUT_VARIABLE HOSTNAME + OUTPUT_STRIP_TRAILING_WHITESPACE) + +SET_DEFAULT(HOSTNAME "$ENV{HOSTNAME}") + +# get the number of processors +include(ProcessorCount) +ProcessorCount(NUM_PROCESSORS) + +# find git +find_package(Git QUIET) +if(NOT GIT_EXECUTABLE) + unset(GIT_EXECUTABLE CACHE) + unset(GIT_EXECUTABLE) +endif() + +function(EXECUTE_GIT_COMMAND VAR) + set(${VAR} "" PARENT_SCOPE) + execute_process(COMMAND ${GIT_EXECUTABLE} ${ARGN} + OUTPUT_VARIABLE VAL + RESULT_VARIABLE RET + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + ERROR_QUIET) + string(REPLACE ";" " " _CMD "${GIT_EXECUTABLE} ${ARGN}") + set(LAST_GIT_COMMAND "${_CMD}" PARENT_SCOPE) + if(RET EQUAL 0) + set(${VAR} "${VAL}" PARENT_SCOPE) + endif() +endfunction() + +# just gets the git branch name if available +function(GET_GIT_BRANCH_NAME VAR) + execute_git_command(GIT_BRANCH branch --show-current) + set(_INVALID "%D" "HEAD") + if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) + execute_git_command(GIT_BRANCH show -s --format=%D) + if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) + execute_git_command(GIT_BRANCH --describe all) + endif() + endif() + # + if(GIT_BRANCH) + string(REPLACE " " ";" _DESC "${GIT_BRANCH}") + # just set it to last one via loop instead of wonky cmake index manip + foreach(_ITR ${_DESC}) + set(GIT_BRANCH "${_ITR}") + endforeach() + set(${VAR} "${GIT_BRANCH}" PARENT_SCOPE) + message(STATUS "GIT BRANCH via '${LAST_GIT_COMMAND}': ${GIT_BRANCH}") + endif() +endfunction() + +# just gets the git branch name if available +function(GET_GIT_AUTHOR_NAME VAR) + execute_git_command(GIT_AUTHOR show -s --format=%an) + if(GIT_AUTHOR) + string(LENGTH "${GIT_AUTHOR}" STRLEN) + # if the build name gets too long, this can cause submission errors + if(STRLEN GREATER 24) + # remove middle initial + string(REGEX REPLACE " [A-Z]\. " " " GIT_AUTHOR "${GIT_AUTHOR}") + # get first and sur name + string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\1" F_NAME "${GIT_AUTHOR}") + string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\2" S_NAME "${GIT_AUTHOR}") + if(S_NAME) + set(GIT_AUTHOR "${S_NAME}") + elseif(F_NAME) + set(GIT_AUTHOR "${F_NAME}") + endif() + endif() + # remove any spaces, quotes, periods, etc. + string(REGEX REPLACE "[ ',;_\.\"]+" "" GIT_AUTHOR "${GIT_AUTHOR}") + set(${VAR} "${GIT_AUTHOR}" PARENT_SCOPE) + message(STATUS "GIT AUTHOR via '${LAST_GIT_COMMAND}': ${GIT_AUTHOR}") + endif() +endfunction() + +# get the name of the branch +GET_GIT_BRANCH_NAME(GIT_BRANCH) +# get the name of the author +GET_GIT_AUTHOR_NAME(GIT_AUTHOR) +# author, prefer git method for consistency +SET_DEFAULT_ARG1(AUTHOR ${GIT_AUTHOR} $ENV{GIT_AUTHOR} $ENV{AUTHOR}) +# SLUG == owner_name/repo_name +SET_DEFAULT_ARG1(SLUG $ENV{TRAVIS_PULL_REQUEST_SLUG} $ENV{TRAVIS_REPO_SLUG} $ENV{APPVEYOR_REPO_NAME} $ENV{PULL_REQUEST_SLUG} $ENV{REPO_SLUG}) +# branch name +SET_DEFAULT_ARG1(BRANCH $ENV{TRAVIS_PULL_REQUEST_BRANCH} $ENV{TRAVIS_BRANCH} $ENV{APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH} $ENV{APPVEYOR_REPO_BRANCH} $ENV{GIT_BRANCH} $ENV{BRANCH_NAME} $ENV{BRANCH} ${GIT_BRANCH}) +# pull request number +SET_DEFAULT_ARG1(PULL_REQUEST_NUM $ENV{TRAVIS_PULL_REQUEST} $ENV{CHANGE_ID} $ENV{APPVEYOR_PULL_REQUEST_NUMBER} $ENV{PULL_REQUEST_NUM}) +# get the event type, e.g. push, pull_request, api, cron, etc. +SET_DEFAULT_ARG1(EVENT_TYPE $ENV{TRAVIS_EVENT_TYPE} ${EVENT_TYPE}) + +if("${BRANCH}" STREQUAL "") + message(STATUS "Checked: environment variables for Travis, Appveyor, Jenkins (git plugin), BRANCH_NAME, BRANCH and 'git branch --show-current'") + message(FATAL_ERROR "Error! Git branch could not be determined. Please provide -DBRANCH=") +endif() + +#----------------------------------------------------------------------------------------# +# +# Set default values if not provided on command-line +# +#----------------------------------------------------------------------------------------# + +SET_DEFAULT(SOURCE_DIR "${WORKING_DIR}") # source directory +SET_DEFAULT(BINARY_DIR "${WORKING_DIR}/build") # build directory +SET_DEFAULT(BUILD_TYPE "${CMAKE_BUILD_TYPE}") # Release, Debug, etc. +SET_DEFAULT(MODEL "Continuous") # Continuous, Nightly, or Experimental +SET_DEFAULT(JOBS 1) # number of parallel ctests +SET_DEFAULT(CTEST_COMMAND "${CMAKE_CTEST_COMMAND}") # just in case +SET_DEFAULT(CTEST_ARGS "-V --output-on-failure") # extra arguments when ctest is called +SET_DEFAULT(GIT_EXECUTABLE "git") # ctest_update +SET_DEFAULT(TARGET "all") # build target +SET_DEFAULT_ARG1(SITE "$ENV{SITE}" + "${HOSTNAME}") # update site +SET_DEFAULT_ARG1(BUILD_JOBS "$ENV{BUILD_JOBS}" + "${NUM_PROCESSORS}") # number of parallel compile jobs +# +# The variable below correspond to ctest arguments, i.e. START,END,STRIDE are +# '-I START,END,STRIDE' +# +SET_DEFAULT(START "") +SET_DEFAULT(END "") +SET_DEFAULT(STRIDE "") +SET_DEFAULT(INCLUDE "") +SET_DEFAULT(EXCLUDE "") +SET_DEFAULT(INCLUDE_LABEL "") +SET_DEFAULT(EXCLUDE_LABEL "") +SET_DEFAULT(PARALLEL_LEVEL "") +SET_DEFAULT(STOP_TIME "") +SET_DEFAULT(LABELS "") +SET_DEFAULT(NOTES "") + +# default static build tag for Nightly +set(BUILD_TAG "${BRANCH}") + +if(NOT BUILD_TYPE) + # default for kokkos if not specified + set(BUILD_TYPE "RelWithDebInfo") +endif() + +# generate dynamic name if continuous or experimental model +if(NOT "${MODEL}" STREQUAL "Nightly") + if(EVENT_TYPE AND PULL_REQUEST_NUM) + # e.g. pull_request/123 + if(AUTHOR) + set(BUILD_TAG "${AUTHOR}/${EVENT_TYPE}/${PULL_REQUEST_NUM}") + else() + set(BUILD_TAG "${EVENT_TYPE}/${PULL_REQUEST_NUM}") + endif() + elseif(SLUG) + # e.g. owner_name/repo_name + set(BUILD_TAG "${SLUG}") + elseif(AUTHOR) + set(BUILD_TAG "${AUTHOR}/${BRANCH}") + endif() + if(EVENT_TYPE AND NOT PULL_REQUEST_NUM) + set(BUILD_TAG "${BUILD_TAG}-${EVENT_TYPE}") + endif() +endif() + +# unnecessary +string(REPLACE "/remotes/" "/" BUILD_TAG "${BUILD_TAG}") +string(REPLACE "/origin/" "/" BUILD_TAG "${BUILD_TAG}") + +message(STATUS "BUILD_TAG: ${BUILD_TAG}") + +set(BUILD_NAME "[${BUILD_TAG}] [${BUILD_NAME}-${BUILD_TYPE}]") + +# colons in build name create extra (empty) entries in CDash +string(REPLACE ":" "-" BUILD_NAME "${BUILD_NAME}") +# unnecessary info +string(REPLACE "/merge]" "]" BUILD_NAME "${BUILD_NAME}") +# consistency +string(REPLACE "/pr/" "/pull/" BUILD_NAME "${BUILD_NAME}") +string(REPLACE "pull_request/" "pull/" BUILD_NAME "${BUILD_NAME}") +# miscellaneous from missing fields +string(REPLACE "--" "-" BUILD_NAME "${BUILD_NAME}") +string(REPLACE "-]" "]" BUILD_NAME "${BUILD_NAME}") + +# check binary directory +if(EXISTS ${BINARY_DIR}) + if(NOT IS_DIRECTORY "${BINARY_DIR}") + message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not a directory!") + endif() + file(GLOB BINARY_DIR_FILES "${BINARY_DIR}/*") + if(NOT "${BINARY_DIR_FILES}" STREQUAL "") + message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not empty!") + endif() +endif() + +get_filename_component(SOURCE_REALDIR ${SOURCE_DIR} REALPATH) +get_filename_component(BINARY_REALDIR ${BINARY_DIR} REALPATH) + +#----------------------------------------------------------------------------------------# +# +# Generate the CTestConfig.cmake +# +#----------------------------------------------------------------------------------------# + +set(CONFIG_ARGS) +foreach(_ARG ${KOKKOS_CMAKE_ARGS}) + if(NOT "${${_ARG}}" STREQUAL "") + get_property(_ARG_TYPE CACHE ${_ARG} PROPERTY TYPE) + if("${_ARG_TYPE}" STREQUAL "UNINITIALIZED") + if("${${_ARG}}" STREQUAL "ON" OR "${${_ARG}}" STREQUAL "OFF") + set(_ARG_TYPE "BOOL") + elseif(EXISTS "${${_ARG}}" AND NOT IS_DIRECTORY "${${_ARG}}") + set(_ARG_TYPE "FILEPATH") + elseif(EXISTS "${${_ARG}}" AND IS_DIRECTORY "${${_ARG}}") + set(_ARG_TYPE "PATH") + elseif(NOT "${${_ARG}}" STREQUAL "") + set(_ARG_TYPE "STRING") + endif() + endif() + set(CONFIG_ARGS "${CONFIG_ARGS}set(${_ARG} \"${${_ARG}}\" CACHE ${_ARG_TYPE} \"\")\n") + endif() +endforeach() + +file(WRITE ${BINARY_REALDIR}/initial-cache.cmake +" +set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\" CACHE STRING \"\") +${CONFIG_ARGS} +") + +file(READ ${BINARY_REALDIR}/initial-cache.cmake _CACHE_INFO) +message(STATUS "Initial cache:\n${_CACHE_INFO}") + +# initialize the cache +set(CONFIG_ARGS "-C ${BINARY_REALDIR}/initial-cache.cmake") + + +# generate the CTestConfig.cmake +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake.in + ${BINARY_REALDIR}/CTestConfig.cmake + @ONLY) + +# copy/generate the dashboard script +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/KokkosCTest.cmake.in + ${BINARY_REALDIR}/KokkosCTest.cmake + @ONLY) + +# custom CTest settings go in ${BINARY_DIR}/CTestCustom.cmake +execute_process( + COMMAND ${CMAKE_COMMAND} -E touch CTestCustom.cmake + WORKING_DIRECTORY ${BINARY_REALDIR} + ) + +#----------------------------------------------------------------------------------------# +# +# Execute CTest +# +#----------------------------------------------------------------------------------------# + +message(STATUS "") +message(STATUS "BUILD_NAME: ${BUILD_NAME}") +message(STATUS "Executing '${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}'...") +message(STATUS "") + +# e.g. -DCTEST_ARGS="--output-on-failure -VV" should really be -DCTEST_ARGS="--output-on-failure;-VV" +string(REPLACE " " ";" CTEST_ARGS "${CTEST_ARGS}") + +execute_process( + COMMAND ${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS} + RESULT_VARIABLE RET + WORKING_DIRECTORY ${BINARY_REALDIR} + ) + +# ensure that any non-zero result variable gets propagated +if(NOT RET EQUAL 0) + message(FATAL_ERROR "CTest return non-zero exit code: ${RET}") +endif() diff --git a/lib/kokkos/cmake/KokkosCTest.cmake.in b/lib/kokkos/cmake/KokkosCTest.cmake.in new file mode 100644 index 0000000000..b6917f3cc1 --- /dev/null +++ b/lib/kokkos/cmake/KokkosCTest.cmake.in @@ -0,0 +1,261 @@ +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") +endif() + +include(ProcessorCount) +ProcessorCount(CTEST_PROCESSOR_COUNT) + +cmake_policy(SET CMP0009 NEW) +cmake_policy(SET CMP0011 NEW) + +# ---------------------------------------------------------------------------- # +# -- Commands +# ---------------------------------------------------------------------------- # +find_program(CTEST_CMAKE_COMMAND NAMES cmake) +find_program(CTEST_UNAME_COMMAND NAMES uname) + +find_program(CTEST_BZR_COMMAND NAMES bzr) +find_program(CTEST_CVS_COMMAND NAMES cvs) +find_program(CTEST_GIT_COMMAND NAMES git) +find_program(CTEST_HG_COMMAND NAMES hg) +find_program(CTEST_P4_COMMAND NAMES p4) +find_program(CTEST_SVN_COMMAND NAMES svn) + +find_program(VALGRIND_COMMAND NAMES valgrind) +find_program(GCOV_COMMAND NAMES gcov) +find_program(LCOV_COMMAND NAMES llvm-cov) +find_program(MEMORYCHECK_COMMAND NAMES valgrind ) + +set(MEMORYCHECK_TYPE Valgrind) +# set(MEMORYCHECK_TYPE Purify) +# set(MEMORYCHECK_TYPE BoundsChecker) +# set(MEMORYCHECK_TYPE ThreadSanitizer) +# set(MEMORYCHECK_TYPE AddressSanitizer) +# set(MEMORYCHECK_TYPE LeakSanitizer) +# set(MEMORYCHECK_TYPE MemorySanitizer) +# set(MEMORYCHECK_TYPE UndefinedBehaviorSanitizer) +set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full") + +# ---------------------------------------------------------------------------- # +# -- Settings +# ---------------------------------------------------------------------------- # +## -- Process timeout in seconds +set(CTEST_TIMEOUT "7200") +## -- Set output to English +set(ENV{LC_MESSAGES} "en_EN" ) + + +# ---------------------------------------------------------------------------- # +# -- Copy ctest configuration file +# ---------------------------------------------------------------------------- # +macro(COPY_CTEST_CONFIG_FILES) + + foreach(_FILE CTestConfig.cmake CTestCustom.cmake) + + # if current directory is not binary or source directory + if(NOT "${CMAKE_CURRENT_LIST_DIR}" STREQUAL "${CTEST_BINARY_DIRECTORY}" AND + NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") + + # if file exists in current directory + if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_FILE}) + configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} + ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) + endif() + + # if source and binary differ + elseif(NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") + + # if file exists in source directory but not in binary directory + if(EXISTS ${CTEST_SOURCE_DIRECTORY}/${_FILE} AND + NOT EXISTS ${CTEST_BINARY_DIRECTORY}/${_FILE}) + configure_file(${CTEST_SOURCE_DIRECTORY}/${_FILE} + ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) + endif() + + endif() + endforeach() + +endmacro() + +ctest_read_custom_files("${CMAKE_CURRENT_LIST_DIR}") + +message(STATUS "CTEST_MODEL: ${CTEST_MODEL}") + +#-------------------------------------------------------------------------# +# Start +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running START_CTEST stage...") +message(STATUS "") + +ctest_start(${CTEST_MODEL} TRACK ${CTEST_MODEL} ${APPEND_CTEST} + ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY}) + + +#-------------------------------------------------------------------------# +# Config +# +copy_ctest_config_files() +ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}") + + +#-------------------------------------------------------------------------# +# Update +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_UPDATE stage...") +message(STATUS "") + +ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}" + RETURN_VALUE up_ret) + + +#-------------------------------------------------------------------------# +# Configure +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_CONFIGURE stage...") +message(STATUS "") + +ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}" + SOURCE ${CTEST_SOURCE_DIRECTORY} + ${APPEND_CTEST} + OPTIONS "${CTEST_CONFIGURE_OPTIONS}" + RETURN_VALUE config_ret) + + +#-------------------------------------------------------------------------# +# Echo configure log bc Damien wants to delay merging this PR for eternity +# +file(GLOB _configure_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastConfigure*.log") +# should only have one but loop just for safety +foreach(_LOG ${_configure_log}) + file(READ ${_LOG} _LOG_MESSAGE) + message(STATUS "Configure Log: ${_LOG}") + message(STATUS "\n${_LOG_MESSAGE}\n") +endforeach() + + +#-------------------------------------------------------------------------# +# Build +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_BUILD stage...") +message(STATUS "") + +ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}" + ${APPEND_CTEST} + RETURN_VALUE build_ret) + + +#-------------------------------------------------------------------------# +# Echo build log bc Damien wants to delay merging this PR for eternity +# +file(GLOB _build_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastBuild*.log") +# should only have one but loop just for safety +foreach(_LOG ${_build_log}) + file(READ ${_LOG} _LOG_MESSAGE) + message(STATUS "Build Log: ${_LOG}") + message(STATUS "\n${_LOG_MESSAGE}\n") +endforeach() + + +#-------------------------------------------------------------------------# +# Test +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_TEST stage...") +message(STATUS "") + +ctest_test(RETURN_VALUE test_ret + ${APPEND_CTEST} + ${START_CTEST} + ${END_CTEST} + ${STRIDE_CTEST} + ${INCLUDE_CTEST} + ${EXCLUDE_CTEST} + ${INCLUDE_LABEL_CTEST} + ${EXCLUDE_LABEL_CTEST} + ${PARALLEL_LEVEL_CTEST} + ${STOP_TIME_CTEST} + SCHEDULE_RANDOM OFF) + + +#-------------------------------------------------------------------------# +# Coverage +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_COVERAGE stage...") +message(STATUS "") + +execute_process(COMMAND ${CTEST_COVERAGE_COMMAND} ${CTEST_COVERAGE_EXTRA_FLAGS} + WORKING_DIRECTORY ${CTEST_BINARY_DIRECTORY} + ERROR_QUIET) + +ctest_coverage(${APPEND_CTEST} + ${CTEST_COVERAGE_LABELS} + RETURN_VALUE cov_ret) + + +#-------------------------------------------------------------------------# +# MemCheck +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_MEMCHECK stage...") +message(STATUS "") + +ctest_memcheck(RETURN_VALUE mem_ret + ${APPEND_CTEST} + ${START_CTEST} + ${END_CTEST} + ${STRIDE_CTEST} + ${INCLUDE_CTEST} + ${EXCLUDE_CTEST} + ${INCLUDE_LABEL_CTEST} + ${EXCLUDE_LABEL_CTEST} + ${PARALLEL_LEVEL_CTEST}) + + +#-------------------------------------------------------------------------# +# Submit +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_SUBMIT stage...") +message(STATUS "") + +file(GLOB_RECURSE NOTE_FILES "${CTEST_BINARY_DIRECTORY}/*CTestNotes.cmake") +foreach(_FILE ${NOTE_FILES}) + message(STATUS "Including CTest notes files: \"${_FILE}\"...") + include("${_FILE}") +endforeach() + +# capture submit error so it doesn't fail because of a submission error +ctest_submit(RETURN_VALUE submit_ret + RETRY_COUNT 2 + RETRY_DELAY 10 + CAPTURE_CMAKE_ERROR submit_err) + +#-------------------------------------------------------------------------# +# Submit +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Finished ${CTEST_MODEL} Stages (${STAGES})") +message(STATUS "") + + +#-------------------------------------------------------------------------# +# Non-zero exit codes for important errors +# +if(NOT config_ret EQUAL 0) + message(FATAL_ERROR "Error during configuration! Exit code: ${config_ret}") +endif() + +if(NOT build_ret EQUAL 0) + message(FATAL_ERROR "Error during build! Exit code: ${build_ret}") +endif() + +if(NOT test_ret EQUAL 0) + message(FATAL_ERROR "Error during testing! Exit code: ${test_ret}") +endif() diff --git a/lib/kokkos/cmake/KokkosConfig.cmake.in b/lib/kokkos/cmake/KokkosConfig.cmake.in index 9fbd22ee5c..44a8fcd9c3 100644 --- a/lib/kokkos/cmake/KokkosConfig.cmake.in +++ b/lib/kokkos/cmake/KokkosConfig.cmake.in @@ -19,17 +19,44 @@ INCLUDE("${Kokkos_CMAKE_DIR}/KokkosTargets.cmake") INCLUDE("${Kokkos_CMAKE_DIR}/KokkosConfigCommon.cmake") UNSET(Kokkos_CMAKE_DIR) -# if CUDA was enabled and separable compilation was specified, e.g. -# find_package(Kokkos COMPONENTS separable_compilation) -# then we set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK -IF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) +# check for conflicts +IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS AND + "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) + MESSAGE(STATUS "'launch_compiler' implies global redirection of targets depending on Kokkos to appropriate compiler.") + MESSAGE(STATUS "'separable_compilation' implies explicitly defining where redirection occurs via 'kokkos_compilation(PROJECT|TARGET|SOURCE|DIRECTORY ...)'") + MESSAGE(FATAL_ERROR "Conflicting COMPONENTS: 'launch_compiler' and 'separable_compilation'") +ENDIF() + +IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS) + # + # if find_package(Kokkos COMPONENTS launch_compiler) then rely on the + # RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK to always redirect to the + # appropriate compiler for Kokkos + # + + MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to the appropriate compiler for Kokkos") + kokkos_compilation( + GLOBAL + CHECK_CUDA_COMPILES) + +ELSEIF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) + # + # if CUDA was enabled, separable compilation was not specified, and current compiler + # cannot compile CUDA, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and + # kokkos_launch_compiler will re-direct to the compiler used to compile CUDA code during installation. + # kokkos_launch_compiler will re-direct if ${CMAKE_CXX_COMPILER} and -DKOKKOS_DEPENDENCE is present, + # otherwise, the original command will be executed + # + # run test to see if CMAKE_CXX_COMPILER=nvcc_wrapper kokkos_compiler_is_nvcc(IS_NVCC ${CMAKE_CXX_COMPILER}) - # if not nvcc_wrapper, use RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK - IF(NOT IS_NVCC AND NOT CMAKE_CXX_COMPILER_ID STREQUAL Clang AND - (NOT DEFINED Kokkos_LAUNCH_COMPILER OR Kokkos_LAUNCH_COMPILER)) - MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to nvcc_wrapper") + + # if not nvcc_wrapper and Kokkos_LAUNCH_COMPILER was not set to OFF + IF(NOT IS_NVCC AND (NOT DEFINED Kokkos_LAUNCH_COMPILER OR Kokkos_LAUNCH_COMPILER)) + MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to the appropriate compiler for Kokkos") kokkos_compilation(GLOBAL) ENDIF() - UNSET(IS_NVCC) # be mindful of the environment, pollution is bad + + # be mindful of the environment, pollution is bad + UNSET(IS_NVCC) ENDIF() diff --git a/lib/kokkos/cmake/KokkosConfigCommon.cmake.in b/lib/kokkos/cmake/KokkosConfigCommon.cmake.in index 42c755c215..ab93e65afe 100644 --- a/lib/kokkos/cmake/KokkosConfigCommon.cmake.in +++ b/lib/kokkos/cmake/KokkosConfigCommon.cmake.in @@ -3,6 +3,7 @@ SET(Kokkos_OPTIONS @KOKKOS_ENABLED_OPTIONS@) SET(Kokkos_TPLS @KOKKOS_ENABLED_TPLS@) SET(Kokkos_ARCH @KOKKOS_ENABLED_ARCH_LIST@) SET(Kokkos_CXX_COMPILER "@CMAKE_CXX_COMPILER@") +SET(Kokkos_CXX_COMPILER_ID "@KOKKOS_CXX_COMPILER_ID@") # These are needed by KokkosKernels FOREACH(DEV ${Kokkos_DEVICES}) @@ -13,13 +14,13 @@ IF(NOT Kokkos_FIND_QUIETLY) MESSAGE(STATUS "Enabled Kokkos devices: ${Kokkos_DEVICES}") ENDIF() -IF (Kokkos_ENABLE_CUDA AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14.0") - #If we are building CUDA, we have tricked CMake because we declare a CXX project - #If the default C++ standard for a given compiler matches the requested - #standard, then CMake just omits the -std flag in later versions of CMake - #This breaks CUDA compilation (CUDA compiler can have a different default - #-std then the underlying host compiler by itself). Setting this variable - #forces CMake to always add the -std flag even if it thinks it doesn't need it +IF (Kokkos_ENABLE_CUDA) + # If we are building CUDA, we have tricked CMake because we declare a CXX project + # If the default C++ standard for a given compiler matches the requested + # standard, then CMake just omits the -std flag in later versions of CMake + # This breaks CUDA compilation (CUDA compiler can have a different default + # -std then the underlying host compiler by itself). Setting this variable + # forces CMake to always add the -std flag even if it thinks it doesn't need it SET(CMAKE_CXX_STANDARD_DEFAULT 98 CACHE INTERNAL "" FORCE) ENDIF() @@ -90,52 +91,6 @@ function(kokkos_check) endif() endfunction() -# this function is provided to easily select which files use nvcc_wrapper: -# -# GLOBAL --> all files -# TARGET --> all files in a target -# SOURCE --> specific source files -# DIRECTORY --> all files in directory -# PROJECT --> all files/targets in a project/subproject -# -FUNCTION(kokkos_compilation) - CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) - - # search relative first and then absolute - SET(_HINTS "${CMAKE_CURRENT_LIST_DIR}/../.." "@CMAKE_INSTALL_PREFIX@") - - # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${_HINTS} - PATHS ${_HINTS} - PATH_SUFFIXES bin) - - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") - ENDIF() - - IF(COMP_GLOBAL) - # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - ELSE() - FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) - # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) - IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) - LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) - UNSET(COMP_${_TYPE}) - ENDIF() - # set the properties if defined - IF(COMP_${_TYPE}) - # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() - # A test to check whether a downstream project set the C++ compiler to NVCC or not # this is called only when Kokkos was installed with Kokkos_ENABLE_CUDA=ON FUNCTION(kokkos_compiler_is_nvcc VAR COMPILER) @@ -159,3 +114,161 @@ FUNCTION(kokkos_compiler_is_nvcc VAR COMPILER) ENDIF() ENDFUNCTION() +# this function checks whether the current CXX compiler supports building CUDA +FUNCTION(kokkos_cxx_compiler_cuda_test _VAR _COMPILER) + + FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu +" +#include +#include + +__global__ +void kernel(int sz, double* data) +{ + int _beg = blockIdx.x * blockDim.x + threadIdx.x; + for(int i = _beg; i < sz; ++i) + data[i] += static_cast(i); +} + +int main() +{ + double* data = NULL; + int blocks = 64; + int grids = 64; + int ret = cudaMalloc(&data, blocks * grids * sizeof(double)); + if(ret != cudaSuccess) + return EXIT_FAILURE; + kernel<<>>(blocks * grids, data); + cudaDeviceSynchronize(); + return EXIT_SUCCESS; +} +") + + # save the command for debugging + SET(_COMMANDS "${_COMPILER} ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu") + + # use execute_process instead of try compile because we want to set custom compiler + EXECUTE_PROCESS(COMMAND ${_COMPILER} ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu + RESULT_VARIABLE _RET + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/compile_tests + TIMEOUT 15 + OUTPUT_QUIET + ERROR_QUIET) + + IF(NOT _RET EQUAL 0) + # save the command for debugging + SET(_COMMANDS "${_COMMAND}\n${_COMPILER} --cuda-gpu-arch=sm_35 ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu") + # try the compile test again with clang arguments + EXECUTE_PROCESS(COMMAND ${_COMPILER} --cuda-gpu-arch=sm_35 -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu + RESULT_VARIABLE _RET + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/compile_tests + TIMEOUT 15 + OUTPUT_QUIET + ERROR_QUIET) + ENDIF() + + SET(${_VAR}_COMMANDS "${_COMMANDS}" PARENT_SCOPE) + SET(${_VAR} ${_RET} PARENT_SCOPE) +ENDFUNCTION() + +# this function is provided to easily select which files use the same compiler as Kokkos +# when it was installed (or nvcc_wrapper): +# +# GLOBAL --> all files +# TARGET --> all files in a target +# SOURCE --> specific source files +# DIRECTORY --> all files in directory +# PROJECT --> all files/targets in a project/subproject +# +# Use the COMPILER argument to specify a compiler, if needed. By default, it will +# set the values to ${Kokkos_CXX_COMPILER} unless Kokkos_ENABLE_CUDA=ON and +# Kokkos_CXX_COMPILER_ID is NVIDIA, then it will set it to nvcc_wrapper +# +# Use CHECK_CUDA_COMPILES to run a check when CUDA is enabled +# +FUNCTION(kokkos_compilation) + CMAKE_PARSE_ARGUMENTS(COMP + "GLOBAL;PROJECT;CHECK_CUDA_COMPILES" + "COMPILER" + "DIRECTORY;TARGET;SOURCE;COMMAND_PREFIX" + ${ARGN}) + + # if built w/o CUDA support, we want to basically make this a no-op + SET(_Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) + + # search relative first and then absolute + SET(_HINTS "${CMAKE_CURRENT_LIST_DIR}/../.." "@CMAKE_INSTALL_PREFIX@") + + # find kokkos_launch_compiler + FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${_HINTS} + PATHS ${_HINTS} + PATH_SUFFIXES bin) + + IF(NOT Kokkos_COMPILE_LAUNCHER) + MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") + ENDIF() + + # if COMPILER was not specified, assume Kokkos_CXX_COMPILER + IF(NOT COMP_COMPILER) + SET(COMP_COMPILER ${Kokkos_CXX_COMPILER}) + IF(_Kokkos_ENABLE_CUDA AND Kokkos_CXX_COMPILER_ID STREQUAL NVIDIA) + # find nvcc_wrapper + FIND_PROGRAM(Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${_HINTS} + PATHS ${_HINTS} + PATH_SUFFIXES bin) + # fatal if we can't nvcc_wrapper + IF(NOT Kokkos_NVCC_WRAPPER) + MESSAGE(FATAL_ERROR "Kokkos could not find nvcc_wrapper. Please set '-DKokkos_NVCC_WRAPPER=/path/to/nvcc_wrapper'") + ENDIF() + SET(COMP_COMPILER ${Kokkos_NVCC_WRAPPER}) + ENDIF() + ENDIF() + + # check that the original compiler still exists! + IF(NOT EXISTS ${COMP_COMPILER}) + MESSAGE(FATAL_ERROR "Kokkos could not find original compiler: '${COMP_COMPILER}'") + ENDIF() + + # try to ensure that compiling cuda code works! + IF(_Kokkos_ENABLE_CUDA AND COMP_CHECK_CUDA_COMPILES) + + # this may fail if kokkos_compiler launcher was used during install + kokkos_cxx_compiler_cuda_test(_COMPILES_CUDA + ${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}) + + # if above failed, throw an error + IF(NOT _COMPILES_CUDA) + MESSAGE(FATAL_ERROR "kokkos_cxx_compiler_cuda_test failed! Test commands:\n${_COMPILES_CUDA_COMMANDS}") + ENDIF() + ENDIF() + + IF(COMP_COMMAND_PREFIX) + SET(_PREFIX "${COMP_COMMAND_PREFIX}") + STRING(REPLACE ";" " " _PREFIX "${COMP_COMMAND_PREFIX}") + SET(Kokkos_COMPILER_LAUNCHER "${_PREFIX} ${Kokkos_COMPILE_LAUNCHER}") + ENDIF() + + IF(COMP_GLOBAL) + # if global, don't bother setting others + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + ELSE() + FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) + IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + UNSET(COMP_${_TYPE}) + ENDIF() + # set the properties if defined + IF(COMP_${_TYPE}) + # MESSAGE(STATUS "Using ${COMP_COMPILER} :: ${_TYPE} :: ${COMP_${_TYPE}}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + ENDIF() + ENDFOREACH() + ENDIF() +ENDFUNCTION() diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index 0259fe69d5..fbfae3711e 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -78,6 +78,7 @@ #cmakedefine KOKKOS_ARCH_POWER7 #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 +#cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_KEPLER #cmakedefine KOKKOS_ARCH_KEPLER30 #cmakedefine KOKKOS_ARCH_KEPLER32 @@ -95,5 +96,8 @@ #cmakedefine KOKKOS_ARCH_VOLTA72 #cmakedefine KOKKOS_ARCH_TURING75 #cmakedefine KOKKOS_ARCH_AMPERE80 +#cmakedefine KOKKOS_ARCH_AMPERE86 #cmakedefine KOKKOS_ARCH_AMD_ZEN #cmakedefine KOKKOS_ARCH_AMD_ZEN2 + +#cmakedefine KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF diff --git a/lib/kokkos/cmake/Modules/CudaToolkit.cmake b/lib/kokkos/cmake/Modules/CudaToolkit.cmake index d620a71d36..eda5541f7c 100644 --- a/lib/kokkos/cmake/Modules/CudaToolkit.cmake +++ b/lib/kokkos/cmake/Modules/CudaToolkit.cmake @@ -481,76 +481,6 @@ if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILE unset(cuda_dir) endif() -IF(CMAKE_VERSION VERSION_LESS "3.12.0") - function(import_target_link_libraries target) - cmake_parse_arguments(HACK - "SYSTEM;INTERFACE;PUBLIC" - "" - "" - ${ARGN} - ) - get_target_property(LIBS ${target} INTERFACE_LINK_LIBRARIES) - if (LIBS) - list(APPEND LIBS ${HACK_UNPARSED_ARGUMENTS}) - else() - set(LIBS ${HACK_UNPARSED_ARGUMENTS}) - endif() - set_target_properties(${target} PROPERTIES - INTERFACE_LINK_LIBRARIES "${LIBS}") - endfunction() -ELSE() - function(import_target_link_libraries) - target_link_libraries(${ARGN}) - endfunction() -ENDIF() - -IF(CMAKE_VERSION VERSION_LESS "3.13.0") - function(import_target_link_directories target) - cmake_parse_arguments(HACK - "SYSTEM;INTERFACE;PUBLIC" - "" - "" - ${ARGN} - ) - get_target_property(LINK_LIBS ${target} INTERFACE_LINK_LIBRARIES) - if (LINK_LIBS) #could be not-found - set(LINK_LIBS_LIST ${LINK_LIBS}) - endif() - foreach(LIB ${HACK_UNPARSED_ARGUMENTS}) - list(APPEND LINK_LIBS_LIST -L${LIB}) - endforeach() - set_target_properties(${target} PROPERTIES - INTERFACE_LINK_LIBRARIES "${LINK_LIBS_LIST}") - endfunction() -ELSE() - function(import_target_link_directories) - target_link_directories(${ARGN}) - endfunction() -ENDIF() - -IF(CMAKE_VERSION VERSION_LESS "3.12.0") - function(import_target_include_directories target) - cmake_parse_arguments(HACK - "SYSTEM;INTERFACE;PUBLIC" - "" - "" - ${ARGN} - ) - get_target_property(INLUDE_DIRS ${target} INTERFACE_INCLUDE_DIRECTORIES) - if (INCLUDE_DIRS) - list(APPEND INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS}) - else() - set(INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS}) - endif() - set_target_properties(${target} PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${INCLUDE_DIRS}") - endfunction() -ELSE() - function(import_target_include_directories) - target_include_directories(${ARGN}) - endfunction() -ENDIF() - # Try language- or user-provided path first. if(CUDAToolkit_BIN_DIR) find_program(CUDAToolkit_NVCC_EXECUTABLE @@ -854,11 +784,11 @@ if(CUDAToolkit_FOUND) if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) - import_target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") - import_target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") foreach(dep ${arg_DEPS}) if(TARGET CUDA::${dep}) - import_target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) endif() endforeach() endif() @@ -866,8 +796,8 @@ if(CUDAToolkit_FOUND) if(NOT TARGET CUDA::toolkit) add_library(CUDA::toolkit IMPORTED INTERFACE) - import_target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") - import_target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") endif() _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) @@ -882,11 +812,11 @@ if(CUDAToolkit_FOUND) AND TARGET CUDA::cudart_static) add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) - import_target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) + target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER)) find_package(Threads REQUIRED) - import_target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) endif() if(UNIX AND NOT APPLE) @@ -896,7 +826,7 @@ if(CUDAToolkit_FOUND) if(NOT CUDAToolkit_rt_LIBRARY) message(WARNING "Could not find librt library, needed by CUDA::cudart_static") else() - import_target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) + target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) endif() endif() endif() diff --git a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake index a1072a60c6..8d58d96415 100644 --- a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -25,7 +25,7 @@ IF (TARGET CUDA::cuda_driver) SET(FOUND_CUDA_DRIVER TRUE) KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) ELSE() - SET(FOUND_CUDA_DRIVVER FALSE) + SET(FOUND_CUDA_DRIVER FALSE) ENDIF() include(FindPackageHandleStandardArgs) diff --git a/lib/kokkos/cmake/Modules/FindTPLPTHREAD.cmake b/lib/kokkos/cmake/Modules/FindTPLPTHREAD.cmake index 1d154e29af..a743fca0e4 100644 --- a/lib/kokkos/cmake/Modules/FindTPLPTHREAD.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLPTHREAD.cmake @@ -10,7 +10,7 @@ TRY_COMPILE(KOKKOS_HAS_PTHREAD_ARG # ${CMAKE_CXX${KOKKOS_CXX_STANDARD}_STANDARD_COMPILE_OPTION} INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(PTHREAD DEFAULT_MSG KOKKOS_HAS_PTHREAD_ARG) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLPTHREAD DEFAULT_MSG KOKKOS_HAS_PTHREAD_ARG) #Only create the TPL if we succeed IF (KOKKOS_HAS_PTHREAD_ARG) KOKKOS_CREATE_IMPORTED_TPL(PTHREAD diff --git a/lib/kokkos/cmake/Modules/FindTPLROCM.cmake b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake new file mode 100644 index 0000000000..512ad6ceb2 --- /dev/null +++ b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake @@ -0,0 +1,11 @@ +include(FindPackageHandleStandardArgs) + +FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) + +find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY) + +kokkos_create_imported_tpl(ROCM INTERFACE + LINK_LIBRARIES ${HSA_RUNTIME_LIBRARY} ${AMD_HIP_LIBRARY} + COMPILE_DEFINITIONS __HIP_ROCclr__ +) diff --git a/lib/kokkos/cmake/compile_tests/cplusplus14.cpp b/lib/kokkos/cmake/compile_tests/cplusplus14.cpp new file mode 100644 index 0000000000..52ec9885ec --- /dev/null +++ b/lib/kokkos/cmake/compile_tests/cplusplus14.cpp @@ -0,0 +1,8 @@ +#include + +int main() { + // _t versions of type traits were added in C++14 + std::remove_cv_t i = 0; + + return i; +} diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc index 48c01c070c..a26ac5af4b 100644 --- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc +++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc @@ -72,6 +72,7 @@ int main() { case 72: std::cout << "Set -DKokkos_ARCH_VOLTA72=ON ." << std::endl; break; case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break; case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break; + case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break; default: std::cout << "Compute capability " << compute_capability << " is not supported" << std::endl; diff --git a/lib/kokkos/cmake/compile_tests/pthread.cpp b/lib/kokkos/cmake/compile_tests/pthread.cpp index 92310da029..3f83bf6a5f 100644 --- a/lib/kokkos/cmake/compile_tests/pthread.cpp +++ b/lib/kokkos/cmake/compile_tests/pthread.cpp @@ -2,7 +2,7 @@ void* kokkos_test(void* args) { return args; } -int main(void) { +int main() { pthread_t thread; /* Use NULL to avoid C++11. Some compilers do not have C++11 by default. Forcing C++11 diff --git a/lib/kokkos/cmake/fake_tribits.cmake b/lib/kokkos/cmake/fake_tribits.cmake index 2e82a46235..fbd6745a60 100644 --- a/lib/kokkos/cmake/fake_tribits.cmake +++ b/lib/kokkos/cmake/fake_tribits.cmake @@ -81,10 +81,16 @@ ENDMACRO() FUNCTION(KOKKOS_ADD_TEST) if (KOKKOS_HAS_TRILINOS) CMAKE_PARSE_ARGUMENTS(TEST - "" + "SKIP_TRIBITS" "EXE;NAME;TOOL" "ARGS" ${ARGN}) + + IF(TEST_SKIP_TRIBITS) + MESSAGE(STATUS "Skipping test ${TEST_NAME} in TriBits") + RETURN() + ENDIF() + IF(TEST_EXE) SET(EXE_ROOT ${TEST_EXE}) ELSE() @@ -119,11 +125,10 @@ FUNCTION(KOKKOS_ADD_TEST) endif() else() CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL" + "WILL_FAIL;SKIP_TRIBITS" "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" "CATEGORIES;ARGS" ${ARGN}) - SET(TESTS_ADDED) # To match Tribits, we should always be receiving # the root names of exes/libs IF(TEST_EXE) @@ -135,48 +140,27 @@ FUNCTION(KOKKOS_ADD_TEST) # These should be the full target name SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - IF (TEST_ARGS) - SET(TEST_NUMBER 0) - FOREACH (ARG_STR ${TEST_ARGS}) - # This is passed as a single string blob to match TriBITS behavior - # We need this to be turned into a list - STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${ARG_STR_LIST}) - ELSE() - ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} COMMAND ${EXE} ${ARG_STR_LIST}) - ENDIF() - LIST(APPEND TESTS_ADDED "${TEST_NAME}${TEST_NUMBER}") - MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") - ENDFOREACH() + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} + COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_ARGS}) ELSE() - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX}) - ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE}) - ENDIF() - LIST(APPEND TESTS_ADDED "${TEST_NAME}") + ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS}) + ENDIF() + IF(TEST_WILL_FAIL) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + ENDIF() + IF(TEST_FAIL_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_PASS_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_TOOL) + ADD_DEPENDENCIES(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + SET_PROPERTY(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") ENDIF() - - FOREACH(TEST_NAME ${TESTS_ADDED}) - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) - ENDIF() - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") - endif() - ENDFOREACH() VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) - endif() + ENDIF() ENDFUNCTION() FUNCTION(KOKKOS_ADD_ADVANCED_TEST) @@ -326,14 +310,6 @@ ENDIF() ENDFUNCTION() -FUNCTION(KOKKOS_TARGET_COMPILE_DEFINITIONS) - IF (KOKKOS_HAS_TRILINOS) - TARGET_COMPILE_DEFINITIONS(${TARGET} ${ARGN}) - ELSE() - TARGET_COMPILE_DEFINITIONS(${TARGET} ${ARGN}) - ENDIF() -ENDFUNCTION() - FUNCTION(KOKKOS_INCLUDE_DIRECTORIES) IF(KOKKOS_HAS_TRILINOS) TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) @@ -350,10 +326,6 @@ ENDIF() ENDFUNCTION() -MACRO(KOKKOS_ADD_COMPILE_OPTIONS) -ADD_COMPILE_OPTIONS(${ARGN}) -ENDMACRO() - MACRO(PRINTALL match) get_cmake_property(_variableNames VARIABLES) list (SORT _variableNames) @@ -376,4 +348,3 @@ FUNCTION(GLOBAL_APPEND VARNAME) LIST(APPEND TEMP ${ARGN}) GLOBAL_SET(${VARNAME} ${TEMP}) ENDFUNCTION() - diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index 53aaf7dccf..ec18e70a36 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -35,7 +35,7 @@ KOKKOS_ARCH_OPTION(ARMV80 HOST "ARMv8.0 Compatible CPU") KOKKOS_ARCH_OPTION(ARMV81 HOST "ARMv8.1 Compatible CPU") KOKKOS_ARCH_OPTION(ARMV8_THUNDERX HOST "ARMv8 Cavium ThunderX CPU") KOKKOS_ARCH_OPTION(ARMV8_THUNDERX2 HOST "ARMv8 Cavium ThunderX2 CPU") -KOKKOS_ARCH_OPTION(A64FX HOST "ARMv8.2 with SVE Suport") +KOKKOS_ARCH_OPTION(A64FX HOST "ARMv8.2 with SVE Support") KOKKOS_ARCH_OPTION(WSM HOST "Intel Westmere CPU") KOKKOS_ARCH_OPTION(SNB HOST "Intel Sandy/Ivy Bridge CPUs") KOKKOS_ARCH_OPTION(HSW HOST "Intel Haswell CPUs") @@ -60,11 +60,12 @@ KOKKOS_ARCH_OPTION(VOLTA70 GPU "NVIDIA Volta generation CC 7.0") KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2") KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5") KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0") +KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6") KOKKOS_ARCH_OPTION(ZEN HOST "AMD Zen architecture") KOKKOS_ARCH_OPTION(ZEN2 HOST "AMD Zen2 architecture") KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900") KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD GPU MI50/MI60 GFX906") -KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU") +KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU MI100 GFX908") KOKKOS_ARCH_OPTION(INTEL_GEN GPU "Intel GPUs Gen9+") @@ -141,8 +142,16 @@ ENDIF() #------------------------------- KOKKOS_HIP_OPTIONS --------------------------- #clear anything that might be in the cache GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP) - SET(AMDGPU_ARCH_FLAG "--amdgpu-target") +IF(KOKKOS_ENABLE_HIP) + IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + SET(AMDGPU_ARCH_FLAG "--amdgpu-target") + ELSE() + SET(AMDGPU_ARCH_FLAG "--offload-arch") + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -x hip) + IF(DEFINED ENV{ROCM_PATH}) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) + ENDIF() + ENDIF() ENDIF() @@ -183,6 +192,8 @@ ENDIF() IF (KOKKOS_ARCH_A64FX) COMPILER_SPECIFIC_FLAGS( DEFAULT -march=armv8.2-a+sve + Clang -march=armv8.2-a+sve -msve-vector-bits=512 + GCC -march=armv8.2-a+sve -msve-vector-bits=512 ) ENDIF() @@ -309,7 +320,7 @@ IF (KOKKOS_ARCH_POWER8 OR KOKKOS_ARCH_POWER9) SET(KOKKOS_USE_ISA_POWERPCLE ON) ENDIF() -IF (Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) +IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) COMPILER_SPECIFIC_FLAGS( Clang -fcuda-rdc NVIDIA --relocatable-device-code=true @@ -333,8 +344,8 @@ ENDIF() #Right now we cannot get the compiler ID when cross-compiling, so just check #that HIP is enabled -IF (Kokkos_ENABLE_HIP) - IF (Kokkos_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) +IF (KOKKOS_ENABLE_HIP) + IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) COMPILER_SPECIFIC_FLAGS( DEFAULT -fgpu-rdc ) @@ -345,8 +356,7 @@ IF (Kokkos_ENABLE_HIP) ENDIF() ENDIF() - -IF (Kokkos_ENABLE_SYCL) +IF (KOKKOS_ENABLE_SYCL) COMPILER_SPECIFIC_FLAGS( DEFAULT -fsycl ) @@ -363,7 +373,7 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") ENDIF() SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET) + IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL) MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) ELSE() @@ -396,6 +406,7 @@ CHECK_CUDA_ARCH(VOLTA70 sm_70) CHECK_CUDA_ARCH(VOLTA72 sm_72) CHECK_CUDA_ARCH(TURING75 sm_75) CHECK_CUDA_ARCH(AMPERE80 sm_80) +CHECK_CUDA_ARCH(AMPERE86 sm_86) SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) @@ -405,12 +416,12 @@ FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) ENDIF() SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET) - MESSAGE(WARNING "Given HIP arch ${ARCH}, but Kokkos_ENABLE_AMDGPU and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") + MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) ELSE() SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_HIP) + IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") ENDIF() ENDIF() @@ -451,6 +462,24 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) ENDIF() ENDIF() +IF (KOKKOS_ENABLE_SYCL) + IF(CUDA_ARCH_ALREADY_SPECIFIED) + IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=nvptx64-nvidia-cuda-sycldevice + ) + # FIXME_SYCL The CUDA backend doesn't support printf yet. + GLOBAL_SET(KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF ON) + ELSE() + MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") + ENDIF() + ELSEIF(KOKKOS_ARCH_INTEL_GEN) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device skl" + ) + ENDIF() +ENDIF() + IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) # Try to autodetect the CUDA Compute Capability by asking the device SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) @@ -464,6 +493,43 @@ IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc COMPILE_DEFINITIONS -DSM_ONLY RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + + # if user is using kokkos_compiler_launcher, above will fail. + IF(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) + # check to see if CUDA is not already enabled (may happen when Kokkos is subproject) + GET_PROPERTY(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + # language has to be fully enabled, just checking for CMAKE_CUDA_COMPILER isn't enough + IF(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) + # make sure the user knows that we aren't using CUDA compiler for anything else + MESSAGE(STATUS "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture...") + INCLUDE(CheckLanguage) + CHECK_LANGUAGE(CUDA) + IF(CMAKE_CUDA_COMPILER) + ENABLE_LANGUAGE(CUDA) + ELSE() + MESSAGE(STATUS "CUDA language could not be enabled") + ENDIF() + ENDIF() + + # if CUDA was enabled, this will be defined + IF(CMAKE_CUDA_COMPILER) + # copy our test to .cu so cmake compiles as CUDA + CONFIGURE_FILE( + ${PROJECT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc + ${PROJECT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu + COPYONLY + ) + # run test again + TRY_RUN( + _RESULT + _COMPILE_RESULT + ${_BINARY_TEST_DIR} + ${PROJECT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu + COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + ENDIF() + ENDIF() + LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") @@ -500,7 +566,7 @@ IF (KOKKOS_ENABLE_CUDA) SET(KOKKOS_ARCH_VOLTA ON) ENDIF() - IF (KOKKOS_ARCH_AMPERE80) + IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) SET(KOKKOS_ARCH_AMPERE ON) ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index e6600161f9..4434d6928f 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -27,6 +27,12 @@ IF(Kokkos_ENABLE_CUDA) PATHS ${PROJECT_SOURCE_DIR} PATH_SUFFIXES bin) + FIND_PROGRAM(Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + # check if compiler was set to nvcc_wrapper kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) # if launcher was found and nvcc_wrapper was not specified as @@ -37,7 +43,7 @@ IF(Kokkos_ENABLE_CUDA) # if the second argument matches the C++ compiler, it forwards the rest of the # args to nvcc_wrapper kokkos_internal_have_compiler_nvcc( - ${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) + ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) SET(INTERNAL_USE_COMPILER_LAUNCHER true) ENDIF() ENDIF() @@ -55,32 +61,7 @@ IF(INTERNAL_HAVE_COMPILER_NVCC) SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") IF(INTERNAL_USE_COMPILER_LAUNCHER) - IF(Kokkos_LAUNCH_COMPILER_INFO) - GET_FILENAME_COMPONENT(BASE_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME) - # does not have STATUS intentionally - MESSAGE("") - MESSAGE("Kokkos_LAUNCH_COMPILER_INFO (${Kokkos_COMPILE_LAUNCHER}):") - MESSAGE(" - Kokkos + CUDA backend requires the C++ files to be compiled as CUDA code.") - MESSAGE(" - kokkos_launch_compiler permits CMAKE_CXX_COMPILER to be set to a traditional C++ compiler when Kokkos_ENABLE_CUDA=ON") - MESSAGE(" by prefixing all the compile and link commands with the path to the script + CMAKE_CXX_COMPILER (${CMAKE_CXX_COMPILER}).") - MESSAGE(" - If any of the compile or link commands have CMAKE_CXX_COMPILER as the first argument, it replaces CMAKE_CXX_COMPILER with nvcc_wrapper.") - MESSAGE(" - If the compile or link command is not CMAKE_CXX_COMPILER, it just executes the command.") - MESSAGE(" - If using ccache, set CMAKE_CXX_COMPILER to nvcc_wrapper explicitly.") - MESSAGE(" - kokkos_compiler_launcher is available to downstream projects as well.") - MESSAGE(" - If CMAKE_CXX_COMPILER=nvcc_wrapper, all legacy behavior will be preserved during 'find_package(Kokkos)'") - MESSAGE(" - If CMAKE_CXX_COMPILER is not nvcc_wrapper, 'find_package(Kokkos)' will apply 'kokkos_compilation(GLOBAL)' unless separable compilation is enabled") - MESSAGE(" - This can be disabled via '-DKokkos_LAUNCH_COMPILER=OFF'") - MESSAGE(" - Use 'find_package(Kokkos COMPONENTS separable_compilation)' to enable separable compilation") - MESSAGE(" - Separable compilation allows you to control the scope of where the compiler transformation behavior (${BASE_COMPILER_NAME} -> nvcc_wrapper) is applied") - MESSAGE(" - The compiler transformation can be applied on a per-project, per-directory, per-target, and/or per-source-file basis") - MESSAGE(" - 'kokkos_compilation(PROJECT)' will apply the compiler transformation to all targets in a project/subproject") - MESSAGE(" - 'kokkos_compilation(TARGET [...])' will apply the compiler transformation to the specified target(s)") - MESSAGE(" - 'kokkos_compilation(SOURCE [...])' will apply the compiler transformation to the specified source file(s)") - MESSAGE(" - 'kokkos_compilation(DIRECTORY [...])' will apply the compiler transformation to the specified directories") - MESSAGE("") - ELSE() - MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled... Set Kokkos_LAUNCH_COMPILER_INFO=ON for more info.") - ENDIF() + MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") kokkos_compilation(GLOBAL) ENDIF() ENDIF() @@ -92,7 +73,11 @@ IF(Kokkos_ENABLE_HIP) OUTPUT_STRIP_TRAILING_WHITESPACE) STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - SET(KOKKOS_CXX_COMPILER_ID HIP CACHE STRING INTERNAL FORCE) + + STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) + IF(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) + SET(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) + ENDIF() STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) @@ -103,8 +88,7 @@ ENDIF() IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) # The Cray compiler reports as Clang to most versions of CMake EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep Cray - COMMAND wc -l + COMMAND grep -c Cray OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER OUTPUT_STRIP_TRAILING_WHITESPACE) IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang @@ -112,8 +96,7 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ENDIF() # The clang based Intel compiler reports as Clang to most versions of CMake EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep icpx - COMMAND wc -l + COMMAND grep -c "DPC++\\|icpx" OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER OUTPUT_STRIP_TRAILING_WHITESPACE) IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang @@ -174,7 +157,7 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP) +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() diff --git a/lib/kokkos/cmake/kokkos_corner_cases.cmake b/lib/kokkos/cmake/kokkos_corner_cases.cmake index 3962c4b16e..a84ac2b630 100644 --- a/lib/kokkos/cmake/kokkos_corner_cases.cmake +++ b/lib/kokkos/cmake/kokkos_corner_cases.cmake @@ -49,11 +49,14 @@ ENDIF() IF (KOKKOS_CXX_STANDARD STREQUAL 17) IF (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 7) - MESSAGE(FATAL_ERROR "You have requested c++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC <= 6 does not properly support *this capture. Please reduce the C++ standard to 14 or upgrade the compiler if you do need C++17 support.") + MESSAGE(FATAL_ERROR "You have requested C++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC < 7 does not properly support *this capture. Please reduce the C++ standard to 14 or upgrade the compiler if you do need C++17 support.") ENDIF() IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11) - MESSAGE(FATAL_ERROR "You have requested c++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION}. NVCC only supports C++17 from version 11 on. Please reduce the C++ standard to 14 or upgrade the compiler if you need C++17 support.") + MESSAGE(FATAL_ERROR "You have requested C++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION}. NVCC only supports C++17 from version 11 on. Please reduce the C++ standard to 14 or upgrade the compiler if you need C++17 support.") + ENDIF() + IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR) + MESSAGE(WARNING "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON with C++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs. See https://github.com/kokkos/kokkos/issues/3496") ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake index 41ee10a8a0..445dad47ce 100644 --- a/lib/kokkos/cmake/kokkos_enable_devices.cmake +++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake @@ -48,9 +48,6 @@ IF(KOKKOS_ENABLE_OPENMP) IF(KOKKOS_CLANG_IS_CRAY) SET(ClangOpenMPFlag -fopenmp) ENDIF() - IF(KOKKOS_CLANG_IS_INTEL) - SET(ClangOpenMPFlag -fiopenmp) - ENDIF() IF(KOKKOS_COMPILER_CLANG_MSVC) #for clang-cl expression /openmp yields an error, so directly add the specific Clang flag SET(ClangOpenMPFlag /clang:-fopenmp=libomp) @@ -64,6 +61,7 @@ IF(KOKKOS_ENABLE_OPENMP) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Clang -Xcompiler ${ClangOpenMPFlag} + IntelClang -Xcompiler -fiopenmp PGI -Xcompiler -mp Cray NO-VALUE-SPECIFIED XL -Xcompiler -qsmp=omp @@ -72,6 +70,7 @@ IF(KOKKOS_ENABLE_OPENMP) ELSE() COMPILER_SPECIFIC_FLAGS( Clang ${ClangOpenMPFlag} + IntelClang -fiopenmp AppleClang -Xpreprocessor -fopenmp PGI -mp Cray NO-VALUE-SPECIFIED @@ -152,3 +151,11 @@ IF (KOKKOS_ENABLE_HIP) ENDIF() KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") + +## SYCL has extra setup requirements, turn on Kokkos_Setup_SYCL.hpp in macros +IF (KOKKOS_ENABLE_SYCL) + IF(KOKKOS_CXX_STANDARD LESS 17) + MESSAGE(FATAL_ERROR "SYCL backend requires C++17 or newer!") + ENDIF() + LIST(APPEND DEVICE_SETUP_LIST SYCL) +ENDIF() diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake index 5df498f373..95bce66c7b 100644 --- a/lib/kokkos/cmake/kokkos_enable_options.cmake +++ b/lib/kokkos/cmake/kokkos_enable_options.cmake @@ -48,6 +48,7 @@ KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler war KOKKOS_ENABLE_OPTION(PROFILING_LOAD_PRINT OFF "Whether to print information about which profiling tools got loaded") KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools") KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") +KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER ON "Whether to potentially use the launch compiler") IF (KOKKOS_ENABLE_CUDA) SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}") @@ -68,6 +69,15 @@ ELSE() ENDIF() KOKKOS_ENABLE_OPTION(COMPLEX_ALIGN ${COMPLEX_ALIGN_DEFAULT} "Whether to align Kokkos::complex to 2*alignof(RealType)") +IF (KOKKOS_ENABLE_TESTS) + SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) +ELSE() + SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) +ENDIF() +KOKKOS_ENABLE_OPTION(HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests") +IF (NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) + MESSAGE(WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. Option will be ignored.") +ENDIF() IF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) SET(CUDA_CONSTEXPR_DEFAULT ON) @@ -76,14 +86,14 @@ ELSE() ENDIF() KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions") +Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") + FUNCTION(check_device_specific_options) CMAKE_PARSE_ARGUMENTS(SOME "" "DEVICE" "OPTIONS" ${ARGN}) IF(NOT KOKKOS_ENABLE_${SOME_DEVICE}) FOREACH(OPTION ${SOME_OPTIONS}) - IF(CMAKE_VERSION VERSION_GREATER_EQUAL 3.14) - IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) - MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") - ENDIF() + IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) + MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") ENDIF() IF(KOKKOS_ENABLE_${OPTION}) MESSAGE(WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored.") diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index 2b17d648b4..858322394d 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ b/lib/kokkos/cmake/kokkos_functions.cmake @@ -169,9 +169,7 @@ MACRO(kokkos_export_imported_tpl NAME) ENDIF() SET(TPL_LINK_OPTIONS) - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13.0") - GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) - ENDIF() + GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) IF(TPL_LINK_OPTIONS) KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") ENDIF() @@ -230,9 +228,7 @@ MACRO(kokkos_import_tpl NAME) # I have still been getting errors about ROOT variables being ignored # I'm not sure if this is a scope issue - but make sure # the policy is set before we do any find_package calls - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - CMAKE_POLICY(SET CMP0074 NEW) - ENDIF() + CMAKE_POLICY(SET CMP0074 NEW) IF (KOKKOS_ENABLE_${NAME}) #Tack on a TPL here to make sure we avoid using anyone else's find @@ -314,7 +310,7 @@ MACRO(kokkos_create_imported_tpl NAME) CMAKE_PARSE_ARGUMENTS(TPL "INTERFACE" "LIBRARY" - "LINK_LIBRARIES;INCLUDES;COMPILE_OPTIONS;LINK_OPTIONS" + "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN}) @@ -334,6 +330,9 @@ MACRO(kokkos_create_imported_tpl NAME) IF(TPL_INCLUDES) TARGET_INCLUDE_DIRECTORIES(${NAME} INTERFACE ${TPL_INCLUDES}) ENDIF() + IF(TPL_COMPILE_DEFINITIONS) + TARGET_COMPILE_DEFINITIONS(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) + ENDIF() IF(TPL_COMPILE_OPTIONS) TARGET_COMPILE_OPTIONS(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) ENDIF() @@ -355,6 +354,10 @@ MACRO(kokkos_create_imported_tpl NAME) SET_TARGET_PROPERTIES(${NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") ENDIF() + IF(TPL_COMPILE_DEFINITIONS) + SET_TARGET_PROPERTIES(${NAME} PROPERTIES + INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") + ENDIF() IF(TPL_COMPILE_OPTIONS) SET_TARGET_PROPERTIES(${NAME} PROPERTIES INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") @@ -770,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET) ENDFUNCTION() FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIP Fujitsu) + SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIPCC Fujitsu) CMAKE_PARSE_ARGUMENTS( PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" @@ -926,6 +929,9 @@ ENDFUNCTION() # DIRECTORY --> all files in directory # PROJECT --> all files/targets in a project/subproject # +# NOTE: this is VERY DIFFERENT than the version in KokkosConfigCommon.cmake.in. +# This version explicitly uses nvcc_wrapper. +# FUNCTION(kokkos_compilation) # check whether the compiler already supports building CUDA KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA) @@ -947,10 +953,21 @@ FUNCTION(kokkos_compilation) MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") ENDIF() + # find nvcc_wrapper + FIND_PROGRAM(Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + + IF(NOT Kokkos_COMPILE_LAUNCHER) + MESSAGE(FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'") + ENDIF() + IF(COMP_GLOBAL) # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") ELSE() FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) @@ -961,8 +978,8 @@ FUNCTION(kokkos_compilation) # set the properties if defined IF(COMP_${_TYPE}) # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") ENDIF() ENDFOREACH() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake index 1d7da922eb..707fb000af 100644 --- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -86,6 +86,19 @@ ELSE() MESSAGE(FATAL_ERROR "Unknown C++ standard ${KOKKOS_CXX_STANDARD} - must be 14, 17, or 20") ENDIF() +# Enforce that we can compile a simple C++14 program + +TRY_COMPILE(CAN_COMPILE_CPP14 + ${KOKKOS_TOP_BUILD_DIR}/corner_cases + ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus14.cpp + OUTPUT_VARIABLE ERROR_MESSAGE + CXX_STANDARD 14 +) +if (NOT CAN_COMPILE_CPP14) + UNSET(CAN_COMPILE_CPP14 CACHE) #make sure CMake always re-runs this + MESSAGE(FATAL_ERROR "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++14 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}") +ENDIF() +UNSET(CAN_COMPILE_CPP14 CACHE) #make sure CMake always re-runs this # Enforce that extensions are turned off for nvcc_wrapper. diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index b58d3696ea..d8d044c9d7 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -1,5 +1,6 @@ KOKKOS_CFG_DEPENDS(TPLS OPTIONS) KOKKOS_CFG_DEPENDS(TPLS DEVICES) +KOKKOS_CFG_DEPENDS(TPLS COMPILER_ID) FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) CMAKE_PARSE_ARGUMENTS(PARSED @@ -38,6 +39,12 @@ IF(KOKKOS_ENABLE_MEMKIND) ENDIF() KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) KOKKOS_TPL_OPTION(LIBRT Off) +IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + SET(ROCM_DEFAULT ON) +ELSE() + SET(ROCM_DEFAULT OFF) +ENDIF() +KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) IF (WIN32) SET(LIBDL_DEFAULT Off) @@ -70,6 +77,7 @@ KOKKOS_IMPORT_TPL(LIBRT) KOKKOS_IMPORT_TPL(LIBDL) KOKKOS_IMPORT_TPL(MEMKIND) KOKKOS_IMPORT_TPL(PTHREAD INTERFACE) +KOKKOS_IMPORT_TPL(ROCM INTERFACE) #Convert list to newlines (which CMake doesn't always like in cache variables) STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake index 059fb192f0..afa036066a 100644 --- a/lib/kokkos/cmake/kokkos_tribits.cmake +++ b/lib/kokkos/cmake/kokkos_tribits.cmake @@ -141,39 +141,54 @@ FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) ENDFUNCTION() FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) -CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES;ARGS" - ${ARGN}) -VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) + CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "SOURCES;CATEGORIES;ARGS" + ${ARGN}) + VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) -IF (KOKKOS_HAS_TRILINOS) - IF(DEFINED PARSE_ARGS) - STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") - ENDIF() - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS kokkos_gtest - NUM_MPI_PROCS 1 - COMM serial mpi - ARGS ${PARSE_ARGS} - CATEGORIES ${PARSE_CATEGORIES} - SOURCES ${PARSE_SOURCES} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) -ELSE() - KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ) - KOKKOS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) -ENDIF() + IF (KOKKOS_HAS_TRILINOS) + IF(DEFINED PARSE_ARGS) + STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") + ENDIF() + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ROOT_NAME} + SOURCES ${PARSE_SOURCES} + TESTONLYLIBS kokkos_gtest + NUM_MPI_PROCS 1 + COMM serial mpi + ARGS ${PARSE_ARGS} + CATEGORIES ${PARSE_CATEGORIES} + SOURCES ${PARSE_SOURCES} + FAIL_REGULAR_EXPRESSION " FAILED " + ARGS ${PARSE_ARGS} + ) + ELSE() + KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} + SOURCES ${PARSE_SOURCES} + ) + IF (PARSE_ARGS) + SET(TEST_NUMBER 0) + FOREACH (ARG_STR ${PARSE_ARGS}) + # This is passed as a single string blob to match TriBITS behavior + # We need this to be turned into a list + STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) + LIST(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") + MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") + KOKKOS_ADD_TEST(NAME ${TEST_NAME} + EXE ${ROOT_NAME} + FAIL_REGULAR_EXPRESSION " FAILED " + ARGS ${ARG_STR_LIST} + ) + ENDFOREACH() + ELSE() + KOKKOS_ADD_TEST(NAME ${ROOT_NAME} + EXE ${ROOT_NAME} + FAIL_REGULAR_EXPRESSION " FAILED " + ) + ENDIF() + ENDIF() ENDFUNCTION() FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) @@ -301,11 +316,26 @@ ENDMACRO() ## Includes generated header files, scripts such as nvcc_wrapper and hpcbind, ## as well as other files provided through plugins. MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) - # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to nvcc_wrapper + + # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to original kokkos compiler + # if nvcc_wrapper was not used as CMAKE_CXX_COMPILER, configure the original compiler into kokkos_launch_compiler + IF(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") + SET(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") + ELSE() + IF(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") + SET(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") + ENDIF() + ENDIF() + + CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler + ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler + @ONLY) + INSTALL(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" "${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler" + "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" DESTINATION ${CMAKE_INSTALL_BINDIR}) INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" @@ -313,7 +343,7 @@ MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp" - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + DESTINATION ${KOKKOS_HEADER_DIR}) ENDMACRO() FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) @@ -330,24 +360,12 @@ FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_LINK_OPTIONS}> ) - ELSEIF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13") + ELSE() #I can use link options #just assume CXX linkage TARGET_LINK_OPTIONS( ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} ) - ELSE() - #assume CXX linkage, we have no good way to check otherwise - IF (PARSE_PLAIN_STYLE) - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} ${KOKKOS_LINK_OPTIONS} - ) - ELSE() - #well, have to do it the wrong way for now - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} - ) - ENDIF() ENDIF() TARGET_COMPILE_OPTIONS( @@ -448,6 +466,13 @@ FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) ${PARSE_SOURCES} ) + IF(PARSE_SHARED OR BUILD_SHARED_LIBS) + SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES + VERSION ${Kokkos_VERSION} + SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} + ) + ENDIF() + KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME}) #In case we are building in-tree, add an alias name diff --git a/lib/kokkos/containers/src/CMakeLists.txt b/lib/kokkos/containers/src/CMakeLists.txt index 7000624b6b..98655896d4 100644 --- a/lib/kokkos/containers/src/CMakeLists.txt +++ b/lib/kokkos/containers/src/CMakeLists.txt @@ -26,8 +26,6 @@ KOKKOS_ADD_LIBRARY( HEADERS ${KOKKOS_CONTAINER_HEADERS} ) -SET_TARGET_PROPERTIES(kokkoscontainers PROPERTIES VERSION ${Kokkos_VERSION}) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} @@ -36,4 +34,3 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers KOKKOS_LINK_INTERNAL_LIBRARY(kokkoscontainers kokkoscore) #----------------------------------------------------------------------------- - diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index 689f0eb2ed..45710d1f73 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -91,6 +91,25 @@ namespace Kokkos { * behavior. Please see the documentation of Kokkos::View for * examples. The default suffices for most users. */ + +namespace Impl { + +#ifdef KOKKOS_ENABLE_CUDA + +inline const Kokkos::Cuda& get_cuda_space(const Kokkos::Cuda& in) { return in; } + +inline const Kokkos::Cuda& get_cuda_space() { + return *Kokkos::Impl::cuda_get_deep_copy_space(); +} + +template +inline const Kokkos::Cuda& get_cuda_space(const NonCudaExecSpace&) { + return get_cuda_space(); +} + +#endif // KOKKOS_ENABLE_CUDA + +} // namespace Impl template class DualView : public ViewTraits { @@ -295,6 +314,53 @@ class DualView : public ViewTraits { "DualView constructed with incompatible views"); } } + // does the DualView have only one device + struct impl_dualview_is_single_device { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device match the device of t_dev? + template + struct impl_device_matches_tdev_device { + enum : bool { + value = std::is_same::value + }; + }; + // does the given device match the device of t_host? + template + struct impl_device_matches_thost_device { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device match the execution space of t_host? + template + struct impl_device_matches_thost_exec { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device match the execution space of t_dev? + template + struct impl_device_matches_tdev_exec { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device's memory space match the memory space of t_dev? + template + struct impl_device_matches_tdev_memory_space { + enum : bool { + value = std::is_same::value + }; + }; //@} //! \name Methods for synchronizing, marking as modified, and getting Views. @@ -302,7 +368,7 @@ class DualView : public ViewTraits { /// \brief Return a View on a specific device \c Device. /// - /// Please don't be afraid of the if_c expression in the return + /// Please don't be afraid of the nested if_c expressions in the return /// value's type. That just tells the method what the return type /// should be: t_dev if the \c Device template parameter matches /// this DualView's device type, else t_host. @@ -323,10 +389,17 @@ class DualView : public ViewTraits { /// typename dual_view_type::t_host hostView = DV.view (); /// \endcode template - KOKKOS_INLINE_FUNCTION const typename Impl::if_c< - std::is_same::value, - t_dev, t_host>::type& + KOKKOS_INLINE_FUNCTION const typename std::conditional_t< + impl_device_matches_tdev_device::value, t_dev, + typename std::conditional_t< + impl_device_matches_thost_device::value, t_host, + typename std::conditional_t< + impl_device_matches_thost_exec::value, t_host, + typename std::conditional_t< + impl_device_matches_tdev_exec::value, t_dev, + typename std::conditional_t< + impl_device_matches_tdev_memory_space::value, + t_dev, t_host> > > > > view() const { constexpr bool device_is_memspace = std::is_same::value; @@ -463,6 +536,7 @@ class DualView : public ViewTraits { true); } } + /// \brief Update data on device or host only if data in the other /// space has been marked as modified. /// @@ -480,12 +554,9 @@ class DualView : public ViewTraits { /// the data in either View. You must manually mark modified data /// as modified, by calling the modify() method with the /// appropriate template parameter. - template - void sync(const typename std::enable_if< - (std::is_same::value) || - (std::is_same::value), - int>::type& = 0) { + // deliberately passing args by cref as they're used multiple times + template + void sync_impl(std::true_type, Args const&... args) { if (modified_flags.data() == nullptr) return; int dev = get_device_side(); @@ -497,12 +568,12 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), true); } #endif - deep_copy(d_view, h_view); + deep_copy(args..., d_view, h_view); modified_flags(0) = modified_flags(1) = 0; impl_report_device_sync(); } @@ -514,12 +585,12 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), false); } #endif - deep_copy(h_view, d_view); + deep_copy(args..., h_view, d_view); modified_flags(0) = modified_flags(1) = 0; impl_report_host_sync(); } @@ -533,10 +604,26 @@ class DualView : public ViewTraits { template void sync(const typename std::enable_if< - (!std::is_same::value) || + (std::is_same::value) || (std::is_same::value), int>::type& = 0) { + sync_impl(std::true_type{}); + } + + template + void sync(const ExecutionSpace& exec, + const typename std::enable_if< + (std::is_same::value) || + (std::is_same::value), + int>::type& = 0) { + sync_impl(std::true_type{}, exec); + } + + // deliberately passing args by cref as they're used multiple times + template + void sync_impl(std::false_type, Args const&...) { if (modified_flags.data() == nullptr) return; int dev = get_device_side(); @@ -557,7 +644,27 @@ class DualView : public ViewTraits { } } - void sync_host() { + template + void sync(const typename std::enable_if< + (!std::is_same::value) || + (std::is_same::value), + int>::type& = 0) { + sync_impl(std::false_type{}); + } + template + void sync(const ExecutionSpace& exec, + const typename std::enable_if< + (!std::is_same::value) || + (std::is_same::value), + int>::type& = 0) { + sync_impl(std::false_type{}, exec); + } + + // deliberately passing args by cref as they're used multiple times + template + void sync_host_impl(Args const&... args) { if (!std::is_same::value) Impl::throw_runtime_exception( @@ -569,18 +676,26 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), false); } #endif - deep_copy(h_view, d_view); + deep_copy(args..., h_view, d_view); modified_flags(1) = modified_flags(0) = 0; impl_report_host_sync(); } } - void sync_device() { + template + void sync_host(const ExecSpace& exec) { + sync_host_impl(exec); + } + void sync_host() { sync_host_impl(); } + + // deliberately passing args by cref as they're used multiple times + template + void sync_device_impl(Args const&... args) { if (!std::is_same::value) Impl::throw_runtime_exception( @@ -592,17 +707,23 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), true); } #endif - deep_copy(d_view, h_view); + deep_copy(args..., d_view, h_view); modified_flags(1) = modified_flags(0) = 0; impl_report_device_sync(); } } + template + void sync_device(const ExecSpace& exec) { + sync_device_impl(exec); + } + void sync_device() { sync_device_impl(); } + template bool need_sync() const { if (modified_flags.data() == nullptr) return false; @@ -658,6 +779,7 @@ class DualView : public ViewTraits { template void modify() { if (modified_flags.data() == nullptr) return; + if (impl_dualview_is_single_device::value) return; int dev = get_device_side(); if (dev == 1) { // if Device is the same as DualView's device type @@ -690,6 +812,7 @@ class DualView : public ViewTraits { } inline void modify_host() { + if (impl_dualview_is_single_device::value) return; if (modified_flags.data() != nullptr) { modified_flags(0) = (modified_flags(1) > modified_flags(0) ? modified_flags(1) @@ -710,6 +833,7 @@ class DualView : public ViewTraits { } inline void modify_device() { + if (impl_dualview_is_single_device::value) return; if (modified_flags.data() != nullptr) { modified_flags(1) = (modified_flags(1) > modified_flags(0) ? modified_flags(1) diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index c66d7a5f36..c6323fef93 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -245,13 +245,10 @@ KOKKOS_INLINE_FUNCTION bool dyn_rank_view_verify_operator_bounds( return (size_t(i) < map.extent(R)) && dyn_rank_view_verify_operator_bounds(rank, map, args...); } else if (i != 0) { - // FIXME_SYCL SYCL doesn't allow printf in kernels -#ifndef KOKKOS_ENABLE_SYCL - printf( + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "DynRankView Debug Bounds Checking Error: at rank %u\n Extra " "arguments beyond the rank must be zero \n", R); -#endif return (false) && dyn_rank_view_verify_operator_bounds(rank, map, args...); } else { @@ -575,37 +572,22 @@ class DynRankView : public ViewTraits { (is_layout_left || is_layout_right || is_layout_stride) }; - template ::accessible> - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() {} - }; - - template - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() { - Kokkos::abort( - "Kokkos::DynRankView ERROR: attempt to access inaccessible memory " - "space"); - }; - }; - // Bounds checking macros #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) // rank of the calling operator - included as first argument in ARG -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ - DynRankView::template verify_space< \ - Kokkos::Impl::ActiveExecutionMemorySpace>::check(); \ - Kokkos::Impl::dyn_rank_view_verify_operator_bounds< \ - typename traits::memory_space> \ +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space::check(); \ + Kokkos::Impl::dyn_rank_view_verify_operator_bounds< \ + typename traits::memory_space> \ ARG; #else -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ - DynRankView::template verify_space< \ - Kokkos::Impl::ActiveExecutionMemorySpace>::check(); +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space::check(); #endif diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index 06bd556661..cc949d4c55 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -76,6 +76,12 @@ struct ChunkArraySpace { using memory_space = typename Kokkos::Experimental::HIPHostPinnedSpace; }; #endif +#ifdef KOKKOS_ENABLE_SYCL +template <> +struct ChunkArraySpace { + using memory_space = typename Kokkos::Experimental::SYCLSharedUSMSpace; +}; +#endif } // end namespace Impl /** \brief Dynamic views are restricted to rank-one and no layout. diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 4fd084338e..0f21a08ba3 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -377,34 +377,20 @@ class OffsetView : public ViewTraits { std::is_same::value && (is_layout_left || is_layout_right || is_layout_stride); - template ::accessible> - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() {} - }; - - template - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() { - Kokkos::abort( - "Kokkos::View ERROR: attempt to access inaccessible memory space"); - }; - }; - #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - OffsetView::template verify_space< \ - Kokkos::Impl::ActiveExecutionMemorySpace>::check(); \ - Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ - typename traits::memory_space> \ +#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space::check(); \ + Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ + typename traits::memory_space> \ ARG; #else -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - OffsetView::template verify_space< \ - Kokkos::Impl::ActiveExecutionMemorySpace>::check(); +#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space::check(); #endif public: diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp index 5e18f5a80e..dcd4cf73e5 100644 --- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -649,13 +649,13 @@ struct ReduceDuplicatesBase { size_t stride; size_t start; size_t n; - ReduceDuplicatesBase(ValueType const* src_in, ValueType* dest_in, - size_t stride_in, size_t start_in, size_t n_in, - std::string const& name) + ReduceDuplicatesBase(ExecSpace const& exec_space, ValueType const* src_in, + ValueType* dest_in, size_t stride_in, size_t start_in, + size_t n_in, std::string const& name) : src(src_in), dst(dest_in), stride(stride_in), start(start_in), n(n_in) { parallel_for( std::string("Kokkos::ScatterView::ReduceDuplicates [") + name + "]", - RangePolicy(0, stride), + RangePolicy(exec_space, 0, stride), static_cast(*this)); } }; @@ -667,9 +667,10 @@ template struct ReduceDuplicates : public ReduceDuplicatesBase { using Base = ReduceDuplicatesBase; - ReduceDuplicates(ValueType const* src_in, ValueType* dst_in, size_t stride_in, - size_t start_in, size_t n_in, std::string const& name) - : Base(src_in, dst_in, stride_in, start_in, n_in, name) {} + ReduceDuplicates(ExecSpace const& exec_space, ValueType const* src_in, + ValueType* dst_in, size_t stride_in, size_t start_in, + size_t n_in, std::string const& name) + : Base(exec_space, src_in, dst_in, stride_in, start_in, n_in, name) {} KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const { for (size_t j = Base::start; j < Base::n; ++j) { ScatterValue struct ResetDuplicatesBase { using Derived = ResetDuplicates; ValueType* data; - ResetDuplicatesBase(ValueType* data_in, size_t size_in, - std::string const& name) + ResetDuplicatesBase(ExecSpace const& exec_space, ValueType* data_in, + size_t size_in, std::string const& name) : data(data_in) { parallel_for( std::string("Kokkos::ScatterView::ResetDuplicates [") + name + "]", - RangePolicy(0, size_in), + RangePolicy(exec_space, 0, size_in), static_cast(*this)); } }; @@ -703,8 +704,9 @@ struct ResetDuplicatesBase { template struct ResetDuplicates : public ResetDuplicatesBase { using Base = ResetDuplicatesBase; - ResetDuplicates(ValueType* data_in, size_t size_in, std::string const& name) - : Base(data_in, size_in, name) {} + ResetDuplicates(ExecSpace const& exec_space, ValueType* data_in, + size_t size_in, std::string const& name) + : Base(exec_space, data_in, size_in, name) {} KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const { ScatterValue @@ -713,6 +715,16 @@ struct ResetDuplicates : public ResetDuplicatesBase { } }; +template +void check_scatter_view_allocation_properties_argument( + ViewCtorProp const&) { + static_assert(ViewCtorProp::has_execution_space && + ViewCtorProp::has_label && + ViewCtorProp::initialize, + "Allocation property must have an execution name as well as a " + "label, and must perform the view initialization"); +} + } // namespace Experimental } // namespace Impl } // namespace Kokkos @@ -762,10 +774,26 @@ class ScatterView const& original_view) : internal_view(original_view) {} + template + ScatterView(execution_space const& /* exec_space */, + View const& original_view) + : internal_view(original_view) {} + template ScatterView(std::string const& name, Dims... dims) : internal_view(name, dims...) {} + // This overload allows specifying an execution space instance to be + // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as + // first argument. + template + ScatterView(::Kokkos::Impl::ViewCtorProp const& arg_prop, Dims... dims) + : internal_view(arg_prop, dims...) { + using ::Kokkos::Impl::Experimental:: + check_scatter_view_allocation_properties_argument; + check_scatter_view_allocation_properties_argument(arg_prop); + } + template KOKKOS_FUNCTION ScatterView( const ScatterView void contribute_into(View const& dest) const { + contribute_into(execution_space(), dest); + } + + template + void contribute_into(execution_space const& exec_space, + View const& dest) const { using dest_type = View; static_assert(std::is_same::value, "ScatterView contribute destination has different layout"); static_assert( - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - memory_space, typename dest_type::memory_space>::value, + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, "ScatterView contribute destination memory space not accessible"); if (dest.data() == internal_view.data()) return; Kokkos::Impl::Experimental::ReduceDuplicates( - internal_view.data(), dest.data(), 0, 0, 1, internal_view.label()); + exec_space, internal_view.data(), dest.data(), 0, 0, 1, + internal_view.label()); } - void reset() { + void reset(execution_space const& exec_space = execution_space()) { Kokkos::Impl::Experimental::ResetDuplicates( - internal_view.data(), internal_view.size(), internal_view.label()); + exec_space, internal_view.data(), internal_view.size(), + internal_view.label()); } template void reset_except(View const& view) { - if (view.data() != internal_view.data()) reset(); + reset_except(execution_space(), view); + } + + template + void reset_except(const execution_space& exec_space, + View const& view) { + if (view.data() != internal_view.data()) reset(exec_space); } void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, @@ -928,10 +970,16 @@ class ScatterView ScatterView(View const& original_view) + : ScatterView(execution_space(), original_view) {} + + template + ScatterView(execution_space const& exec_space, + View const& original_view) : unique_token(), internal_view( view_alloc(WithoutInitializing, - std::string("duplicated_") + original_view.label()), + std::string("duplicated_") + original_view.label(), + exec_space), unique_token.size(), original_view.rank_dynamic > 0 ? original_view.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -949,14 +997,32 @@ class ScatterView ScatterView(std::string const& name, Dims... dims) - : internal_view(view_alloc(WithoutInitializing, name), + : ScatterView(view_alloc(execution_space(), name), dims...) {} + + // This overload allows specifying an execution space instance to be + // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as + // first argument. + template + ScatterView(::Kokkos::Impl::ViewCtorProp const& arg_prop, Dims... dims) + : internal_view(view_alloc(WithoutInitializing, + static_cast<::Kokkos::Impl::ViewCtorProp< + void, std::string> const&>(arg_prop) + .value), unique_token.size(), dims...) { - reset(); + using ::Kokkos::Impl::Experimental:: + check_scatter_view_allocation_properties_argument; + check_scatter_view_allocation_properties_argument(arg_prop); + + auto const exec_space = + static_cast<::Kokkos::Impl::ViewCtorProp const&>( + arg_prop) + .value; + reset(exec_space); } template @@ -984,37 +1050,51 @@ class ScatterView void contribute_into(View const& dest) const { + contribute_into(execution_space(), dest); + } + + template + void contribute_into(execution_space const& exec_space, + View const& dest) const { using dest_type = View; static_assert(std::is_same::value, "ScatterView deep_copy destination has different layout"); static_assert( - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - memory_space, typename dest_type::memory_space>::value, + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, "ScatterView deep_copy destination memory space not accessible"); bool is_equal = (dest.data() == internal_view.data()); size_t start = is_equal ? 1 : 0; Kokkos::Impl::Experimental::ReduceDuplicates( - internal_view.data(), dest.data(), internal_view.stride(0), start, - internal_view.extent(0), internal_view.label()); + exec_space, internal_view.data(), dest.data(), internal_view.stride(0), + start, internal_view.extent(0), internal_view.label()); } - void reset() { + void reset(execution_space const& exec_space = execution_space()) { Kokkos::Impl::Experimental::ResetDuplicates( - internal_view.data(), internal_view.size(), internal_view.label()); + exec_space, internal_view.data(), internal_view.size(), + internal_view.label()); } + template void reset_except(View const& view) { + reset_except(execution_space(), view); + } + + template + void reset_except(execution_space const& exec_space, + View const& view) { if (view.data() != internal_view.data()) { - reset(); + reset(exec_space); return; } Kokkos::Impl::Experimental::ResetDuplicates( - internal_view.data() + view.size(), internal_view.size() - view.size(), - internal_view.label()); + exec_space, internal_view.data() + view.size(), + internal_view.size() - view.size(), internal_view.label()); } void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, @@ -1075,7 +1155,13 @@ class ScatterView - ScatterView(View const& original_view) : unique_token() { + ScatterView(View const& original_view) + : ScatterView(execution_space(), original_view) {} + + template + ScatterView(execution_space const& exec_space, + View const& original_view) + : unique_token() { size_t arg_N[8] = {original_view.rank > 0 ? original_view.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, original_view.rank > 1 ? original_view.extent(1) @@ -1094,14 +1180,27 @@ class ScatterView - ScatterView(std::string const& name, Dims... dims) { + ScatterView(std::string const& name, Dims... dims) + : ScatterView(view_alloc(execution_space(), name), dims...) {} + + // This overload allows specifying an execution space instance to be + // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as + // first argument. + template + ScatterView(::Kokkos::Impl::ViewCtorProp const& arg_prop, + Dims... dims) { + using ::Kokkos::Impl::Experimental:: + check_scatter_view_allocation_properties_argument; + check_scatter_view_allocation_properties_argument(arg_prop); + original_view_type original_view; size_t arg_N[8] = {original_view.rank > 0 ? original_view.static_extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -1120,10 +1219,20 @@ class ScatterView const&>( + arg_prop) + .value; internal_view = internal_view_type(view_alloc(WithoutInitializing, name), arg_N[0], arg_N[1], arg_N[2], arg_N[3], arg_N[4], arg_N[5], arg_N[6], arg_N[7]); - reset(); + + auto const exec_space = + static_cast<::Kokkos::Impl::ViewCtorProp const&>( + arg_prop) + .value; + reset(exec_space); } template @@ -1166,6 +1275,12 @@ class ScatterView void contribute_into(View const& dest) const { + contribute_into(execution_space(), dest); + } + + template + void contribute_into(execution_space const& exec_space, + View const& dest) const { using dest_type = View; static_assert( std::is_same::value, "ScatterView deep_copy destination has different layout"); static_assert( - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - memory_space, typename dest_type::memory_space>::value, + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, "ScatterView deep_copy destination memory space not accessible"); auto extent = internal_view.extent(internal_view_type::rank - 1); bool is_equal = (dest.data() == internal_view.data()); size_t start = is_equal ? 1 : 0; Kokkos::Impl::Experimental::ReduceDuplicates( - internal_view.data(), dest.data(), + exec_space, internal_view.data(), dest.data(), internal_view.stride(internal_view_type::rank - 1), start, extent, internal_view.label()); } - void reset() { + void reset(execution_space const& exec_space = execution_space()) { Kokkos::Impl::Experimental::ResetDuplicates( - internal_view.data(), internal_view.size(), internal_view.label()); + exec_space, internal_view.data(), internal_view.size(), + internal_view.label()); } + template void reset_except(View const& view) { + reset_except(execution_space(), view); + } + + template + void reset_except(execution_space const& exec_space, + View const& view) { if (view.data() != internal_view.data()) { - reset(); + reset(exec_space); return; } Kokkos::Impl::Experimental::ResetDuplicates( - internal_view.data() + view.size(), internal_view.size() - view.size(), - internal_view.label()); + exec_space, internal_view.data() + view.size(), + internal_view.size() - view.size(), internal_view.label()); } void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, @@ -1316,21 +1439,21 @@ template ::array_layout, typename ViewTraits::device_type, Op, - typename Kokkos::Impl::if_c< + std::conditional_t< std::is_same::value, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, - Duplication>::type, - typename Kokkos::Impl::if_c< + Duplication>, + std::conditional_t< std::is_same::value, typename Kokkos::Impl::Experimental::DefaultContribution< typename ViewTraits::execution_space, - typename Kokkos::Impl::if_c< + typename std::conditional_t< std::is_same::value, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, - Duplication>::type>::type, - Contribution>::type> + Duplication>>::type, + Contribution>> create_scatter_view(View const& original_view) { return original_view; // implicit ScatterView constructor call } @@ -1365,12 +1488,21 @@ create_scatter_view(Op, Duplication, Contribution, namespace Kokkos { namespace Experimental { +template +void contribute( + typename ES::execution_space const& exec_space, View& dest, + Kokkos::Experimental::ScatterView const& src) { + src.contribute_into(exec_space, dest); +} + template void contribute( View& dest, Kokkos::Experimental::ScatterView const& src) { - src.contribute_into(dest); + using execution_space = typename ES::execution_space; + contribute(execution_space{}, dest, src); } } // namespace Experimental diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp index d2affda93a..edb0e7261d 100644 --- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -264,26 +264,24 @@ class UnorderedMap { private: enum : size_type { invalid_index = ~static_cast(0) }; - using impl_value_type = - typename Impl::if_c::type; + using impl_value_type = std::conditional_t; - using key_type_view = typename Impl::if_c< + using key_type_view = std::conditional_t< is_insertable_map, View, - View > >::type; + View > >; - using value_type_view = - typename Impl::if_c, - View > >::type; + using value_type_view = std::conditional_t< + is_insertable_map || is_modifiable_map, + View, + View > >; - using size_type_view = typename Impl::if_c< + using size_type_view = std::conditional_t< is_insertable_map, View, - View > >::type; + View > >; using bitset_type = - typename Impl::if_c, - ConstBitset >::type; + std::conditional_t, + ConstBitset >; enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 }; enum { num_scalars = 3 }; @@ -540,10 +538,7 @@ class UnorderedMap { // Previously claimed an unused entry that was not inserted. // Release this unused entry immediately. if (!m_available_indexes.reset(new_index)) { - // FIXME_SYCL SYCL doesn't allow printf in kernels -#ifndef KOKKOS_ENABLE_SYCL - printf("Unable to free existing\n"); -#endif + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Unable to free existing\n"); } } @@ -659,8 +654,8 @@ class UnorderedMap { /// /// 'const value_type' via Cuda texture fetch must return by value. KOKKOS_FORCEINLINE_FUNCTION - typename Impl::if_c<(is_set || has_const_value), impl_value_type, - impl_value_type &>::type + std::conditional_t<(is_set || has_const_value), impl_value_type, + impl_value_type &> value_at(size_type i) const { return m_values[is_set ? 0 : (i < capacity() ? i : capacity())]; } diff --git a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp index 6e450598d1..6047e60f3d 100644 --- a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp +++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp @@ -57,10 +57,22 @@ namespace Kokkos { namespace Impl { +KOKKOS_FORCEINLINE_FUNCTION +unsigned rotate_left(unsigned i, int r) { + constexpr int size = static_cast(sizeof(unsigned) * CHAR_BIT); + return r ? ((i << r) | (i >> (size - r))) : i; +} + KOKKOS_FORCEINLINE_FUNCTION unsigned rotate_right(unsigned i, int r) { - enum { size = static_cast(sizeof(unsigned) * CHAR_BIT) }; + constexpr int size = static_cast(sizeof(unsigned) * CHAR_BIT); + // FIXME_SYCL llvm.fshr.i32 missing + // (https://github.com/intel/llvm/issues/3308) +#ifdef __SYCL_DEVICE_ONLY__ + return rotate_left(i, size - r); +#else return r ? ((i >> r) | (i << (size - r))) : i; +#endif } template diff --git a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp index b06ab0846c..d7c4a5d1ff 100644 --- a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -250,8 +250,8 @@ struct UnorderedMapPrint { uint32_t list = m_map.m_hash_lists(i); for (size_type curr = list, ii = 0; curr != invalid_index; curr = m_map.m_next_index[curr], ++ii) { - printf("%d[%d]: %d->%d\n", list, ii, m_map.key_at(curr), - m_map.value_at(curr)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d[%d]: %d->%d\n", list, ii, + m_map.key_at(curr), m_map.value_at(curr)); } } }; diff --git a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt index c84c5f6d5e..947d222c27 100644 --- a/lib/kokkos/containers/unit_tests/CMakeLists.txt +++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt @@ -2,6 +2,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) # Because there is always an exception to the rule @@ -41,11 +42,6 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) configure_file(${dir}/dummy.cpp ${file}) list(APPEND UnitTestSources ${file}) endforeach() - list(REMOVE_ITEM UnitTestSources - ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Bitset.cpp - ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_ScatterView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_UnorderedMap.cpp - ) KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() diff --git a/lib/kokkos/containers/unit_tests/Makefile b/lib/kokkos/containers/unit_tests/Makefile index f42b9b7519..82669fe1ab 100644 --- a/lib/kokkos/containers/unit_tests/Makefile +++ b/lib/kokkos/containers/unit_tests/Makefile @@ -26,7 +26,7 @@ override LDFLAGS += -lpthread include $(KOKKOS_PATH)/Makefile.kokkos -KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests -I${KOKKOS_PATH}/core/unit_test/category_files TEST_TARGETS = TARGETS = diff --git a/lib/kokkos/containers/unit_tests/TestCuda_Category.hpp b/lib/kokkos/containers/unit_tests/TestCuda_Category.hpp deleted file mode 100644 index 50935d7a34..0000000000 --- a/lib/kokkos/containers/unit_tests/TestCuda_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_CUDA_HPP -#define KOKKOS_TEST_CUDA_HPP - -#define TEST_CATEGORY cuda -#define TEST_EXECSPACE Kokkos::Cuda - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp index 531caf0f85..3eee85ed10 100644 --- a/lib/kokkos/containers/unit_tests/TestDualView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp @@ -114,6 +114,8 @@ struct test_dualview_combinations { a.template modify(); a.template sync(); + a.template sync( + Kokkos::DefaultExecutionSpace{}); a.h_view(5, 1) = 3; a.h_view(6, 1) = 4; @@ -122,11 +124,15 @@ struct test_dualview_combinations { ViewType b = Kokkos::subview(a, std::pair(6, 9), std::pair(0, 1)); a.template sync(); + a.template sync( + Kokkos::DefaultExecutionSpace{}); b.template modify(); Kokkos::deep_copy(b.d_view, 2); a.template sync(); + a.template sync( + Kokkos::DefaultExecutionSpace{}); Scalar count = 0; for (unsigned int i = 0; i < a.d_view.extent(0); i++) for (unsigned int j = 0; j < a.d_view.extent(1); j++) @@ -180,6 +186,7 @@ struct test_dual_view_deep_copy { } else { a.modify_device(); a.sync_host(); + a.sync_host(Kokkos::DefaultExecutionSpace{}); } // Check device view is initialized as expected @@ -208,6 +215,7 @@ struct test_dual_view_deep_copy { b.template sync(); } else { b.sync_host(); + b.sync_host(Kokkos::DefaultExecutionSpace{}); } // Perform same checks on b as done on a @@ -302,6 +310,7 @@ struct test_dualview_resize { ASSERT_EQ(a.extent(1), m / factor); a.sync_device(); + a.sync_device(Kokkos::DefaultExecutionSpace{}); // Check device view is initialized as expected a_d_sum = 0; @@ -404,19 +413,14 @@ void test_dualview_resize() { Impl::test_dualview_resize(); } -// FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dualview_combination) { test_dualview_combinations(10, true); } -#endif TEST(TEST_CATEGORY, dualview_alloc) { test_dualview_alloc(10); } -// FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dualview_combinations_without_init) { test_dualview_combinations(10, false); } @@ -433,8 +437,133 @@ TEST(TEST_CATEGORY, dualview_realloc) { TEST(TEST_CATEGORY, dualview_resize) { test_dualview_resize(); } + +namespace { +/** + * + * The following tests are a response to + * https://github.com/kokkos/kokkos/issues/3850 + * and + * https://github.com/kokkos/kokkos/pull/3857 + * + * DualViews were returning incorrect view types and taking + * inappropriate actions based on the templated view methods. + * + * Specifically, template view methods were always returning + * a device view if the memory space was UVM and a Kokkos::Device was passed. + * Sync/modify methods completely broke down So these tests exist to make sure + * that we keep the semantics of UVM DualViews intact. + */ +// modify if we have other UVM enabled backends +#ifdef KOKKOS_ENABLE_CUDA // OR other UVM builds +#define UVM_ENABLED_BUILD #endif +#ifdef UVM_ENABLED_BUILD +template +struct UVMSpaceFor; +#endif + +#ifdef KOKKOS_ENABLE_CUDA // specific to CUDA +template <> +struct UVMSpaceFor { + using type = Kokkos::CudaUVMSpace; +}; +#endif + +#ifdef UVM_ENABLED_BUILD +template <> +struct UVMSpaceFor { + using type = typename UVMSpaceFor::type; +}; +#else +template +struct UVMSpaceFor { + using type = typename ExecSpace::memory_space; +}; +#endif + +using ExecSpace = Kokkos::DefaultExecutionSpace; +using MemSpace = typename UVMSpaceFor::type; +using DeviceType = Kokkos::Device; + +using DualViewType = Kokkos::DualView; +using d_device = DeviceType; +using h_device = Kokkos::Device< + Kokkos::DefaultHostExecutionSpace, + typename UVMSpaceFor::type>; + +TEST(TEST_CATEGORY, dualview_device_correct_kokkos_device) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + auto v_d = dv.template view(); + using vdt = decltype(v_d); + using vdt_d = vdt::device_type; + using vdt_d_e = vdt_d::execution_space; + ASSERT_STREQ(vdt_d_e::name(), Kokkos::DefaultExecutionSpace::name()); +} +TEST(TEST_CATEGORY, dualview_host_correct_kokkos_device) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + auto v_h = dv.template view(); + using vht = decltype(v_h); + using vht_d = vht::device_type; + using vht_d_e = vht_d::execution_space; + ASSERT_STREQ(vht_d_e::name(), Kokkos::DefaultHostExecutionSpace::name()); +} + +TEST(TEST_CATEGORY, dualview_host_modify_template_device_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_host(); + dv.template sync(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, dualview_host_modify_template_device_execspace_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_host(); + dv.template sync(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, dualview_device_modify_template_host_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_device(); + dv.template sync(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} +TEST(TEST_CATEGORY, dualview_device_modify_template_host_execspace_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_device(); + dv.template sync(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, + dualview_template_views_return_correct_executionspace_views) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + using hvt = decltype(dv.view()); + using dvt = decltype(dv.view()); + ASSERT_STREQ(Kokkos::DefaultExecutionSpace::name(), + dvt::device_type::execution_space::name()); + ASSERT_STREQ(Kokkos::DefaultHostExecutionSpace::name(), + hvt::device_type::execution_space::name()); +} + +} // anonymous namespace } // namespace Test #endif // KOKKOS_TEST_DUALVIEW_HPP diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index 4b9f994417..f018793dd6 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -243,8 +243,6 @@ struct TestDynamicView { } }; -// FIXME_SYCL needs resize_serial -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dynamic_view) { using TestDynView = TestDynamicView; @@ -252,7 +250,6 @@ TEST(TEST_CATEGORY, dynamic_view) { TestDynView::run(100000 + 100 * i); } } -#endif } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestHPX_Category.hpp b/lib/kokkos/containers/unit_tests/TestHPX_Category.hpp deleted file mode 100644 index 64fc7c0757..0000000000 --- a/lib/kokkos/containers/unit_tests/TestHPX_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_HPX_HPP -#define KOKKOS_TEST_HPX_HPP - -#define TEST_CATEGORY hpx -#define TEST_EXECSPACE Kokkos::Experimental::HPX - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp index 802813b13b..9ddc226e29 100644 --- a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -130,8 +130,6 @@ void test_offsetview_construction() { } } - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL const int ovmin0 = ov.begin(0); const int ovend0 = ov.end(0); const int ovmin1 = ov.begin(1); @@ -178,7 +176,6 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; -#endif #endif { @@ -215,8 +212,6 @@ void test_offsetview_construction() { point3_type{{extent0, extent1, extent2}}); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifdef KOKKOS_ENABLE_SYCL int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -239,7 +234,6 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; -#endif #endif } view_type viewFromOV = ov.view(); @@ -266,8 +260,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(aView, ov); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -277,7 +269,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; -#endif #endif } @@ -288,8 +279,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(ov, aView); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -299,7 +288,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; -#endif #endif } } @@ -471,8 +459,6 @@ void test_offsetview_subview() { ASSERT_EQ(offsetSubview.end(1), 9); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -498,7 +484,6 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); -#endif #endif } @@ -701,12 +686,9 @@ void test_offsetview_offsets_rank3() { } #endif -// FIXME_SYCL needs MDRangePolicy -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); } -#endif TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { test_offsetview_unmanaged_construction(); diff --git a/lib/kokkos/containers/unit_tests/TestOpenMP_Category.hpp b/lib/kokkos/containers/unit_tests/TestOpenMP_Category.hpp deleted file mode 100644 index a0169d1702..0000000000 --- a/lib/kokkos/containers/unit_tests/TestOpenMP_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_OPENMP_HPP -#define KOKKOS_TEST_OPENMP_HPP - -#define TEST_CATEGORY openmp -#define TEST_EXECSPACE Kokkos::OpenMP - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp b/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp deleted file mode 100644 index 51fd3fc911..0000000000 --- a/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_SYCL_HPP -#define KOKKOS_TEST_SYCL_HPP - -#define TEST_CATEGORY sycl -#define TEST_EXECSPACE Kokkos::Experimental::SYCL - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp index 3a3cb607a6..fdbce2d492 100644 --- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp @@ -437,6 +437,10 @@ struct test_scatter_view_config { Contribution, Op, NumberType>::orig_view_type; + void compile_constructor() { + auto sv = scatter_view_def(Kokkos::view_alloc(DeviceType{}, "label"), 10); + } + void run_test(int n) { // test allocation { diff --git a/lib/kokkos/containers/unit_tests/TestSerial_Category.hpp b/lib/kokkos/containers/unit_tests/TestSerial_Category.hpp deleted file mode 100644 index 2aa09a315a..0000000000 --- a/lib/kokkos/containers/unit_tests/TestSerial_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_SERIAL_HPP -#define KOKKOS_TEST_SERIAL_HPP - -#define TEST_CATEGORY serial -#define TEST_EXECSPACE Kokkos::Serial - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp index 8bb267ce5d..a9a178f95e 100644 --- a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp +++ b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp @@ -285,10 +285,7 @@ void run_test_graph4() { TEST(TEST_CATEGORY, staticcrsgraph) { TestStaticCrsGraph::run_test_graph(); - // FIXME_SYCL requires MDRangePolicy -#ifndef KOKKOS_ENABLE_SYCL TestStaticCrsGraph::run_test_graph2(); -#endif TestStaticCrsGraph::run_test_graph3(1, 0); TestStaticCrsGraph::run_test_graph3(1, 1000); TestStaticCrsGraph::run_test_graph3(1, 10000); diff --git a/lib/kokkos/containers/unit_tests/TestThreads_Category.hpp b/lib/kokkos/containers/unit_tests/TestThreads_Category.hpp deleted file mode 100644 index 74a2b0da36..0000000000 --- a/lib/kokkos/containers/unit_tests/TestThreads_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_THREADS_HPP -#define KOKKOS_TEST_THREADS_HPP - -#define TEST_CATEGORY threads -#define TEST_EXECSPACE Kokkos::Threads - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp index d39e0061c7..4413cfbc80 100644 --- a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -163,7 +163,8 @@ struct TestFind { KOKKOS_INLINE_FUNCTION void operator()(typename execution_space::size_type i, value_type &errors) const { - const bool expect_to_find_i = (i < m_max_key); + const bool expect_to_find_i = + (i < typename execution_space::size_type(m_max_key)); const bool exists = m_map.exists(i); @@ -293,10 +294,11 @@ void test_deep_copy(uint32_t num_nodes) { } } -// FIXME_HIP wrong result in CI but works locally -#ifndef KOKKOS_ENABLE_HIP +// FIXME_SYCL wrong results on Nvidia GPUs but correct on Host and Intel GPUs +// FIXME_HIP // WORKAROUND MSVC -#ifndef _WIN32 +#if !(defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 401)) && \ + !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL) TEST(TEST_CATEGORY, UnorderedMap_insert) { for (int i = 0; i < 500; ++i) { test_insert(100000, 90000, 100, true); @@ -304,7 +306,6 @@ TEST(TEST_CATEGORY, UnorderedMap_insert) { } } #endif -#endif TEST(TEST_CATEGORY, UnorderedMap_failed_insert) { for (int i = 0; i < 1000; ++i) test_failed_insert(10000); diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index b7b817c910..9ff4b6006d 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -9,6 +9,14 @@ # that in TriBITS KokkosAlgorithms can be disabled... #INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") +# FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. +IF (KOKKOS_ENABLE_OPENMPTARGET + AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI + OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) + RETURN() +ENDIF() + + SET(SOURCES PerfTestMain.cpp PerfTestGramSchmidt.cpp @@ -68,8 +76,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) # This test currently times out for MSVC -# FIXME_SYCL these tests don't compile yet (require parallel_for). -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT Kokkos_ENABLE_SYCL) +IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") KOKKOS_ADD_EXECUTABLE_AND_TEST( PerfTestExec SOURCES ${SOURCES} @@ -77,13 +84,11 @@ IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT Kokkos_ENABLE_SYCL) ) ENDIF() -# FIXME_SYCL -IF(NOT Kokkos_ENABLE_SYCL) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_Atomic - SOURCES test_atomic.cpp - CATEGORIES PERFORMANCE - ) +KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_Atomic + SOURCES test_atomic.cpp + CATEGORIES PERFORMANCE +) IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) KOKKOS_ADD_EXECUTABLE_AND_TEST( @@ -98,7 +103,6 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST( SOURCES test_mempool.cpp CATEGORIES PERFORMANCE ) -ENDIF() IF(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME OPENMPTARGET needs tasking diff --git a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp index 70186283c1..dee21fd7a5 100644 --- a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp +++ b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp @@ -69,7 +69,7 @@ struct InvNorm2 : public Kokkos::DotSingle { KOKKOS_INLINE_FUNCTION void final(value_type& result) const { - result = std::sqrt(result); + result = Kokkos::Experimental::sqrt(result); Rjj() = result; inv() = (0 < result) ? 1.0 / result : 0; } @@ -145,7 +145,7 @@ struct ModifiedGramSchmidt { // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ; Kokkos::scale(tmp, Qj); - for (size_t k = j + 1; k < count; ++k) { + for (size_type k = j + 1; k < count; ++k) { const vector_type Qk = Kokkos::subview(Q_, Kokkos::ALL(), k); const value_view Rjk = Kokkos::subview(R_, j, k); @@ -165,7 +165,7 @@ struct ModifiedGramSchmidt { //-------------------------------------------------------------------------- - static double test(const size_t length, const size_t count, + static double test(const size_type length, const size_type count, const size_t iter = 1) { multivector_type Q_("Q", length, count); multivector_type R_("R", count, count); diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt index e0590a78a4..2ab0989805 100644 --- a/lib/kokkos/core/src/CMakeLists.txt +++ b/lib/kokkos/core/src/CMakeLists.txt @@ -72,8 +72,6 @@ KOKKOS_ADD_LIBRARY( ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags ) -SET_TARGET_PROPERTIES(kokkoscore PROPERTIES VERSION ${Kokkos_VERSION}) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} @@ -87,3 +85,4 @@ KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) KOKKOS_LINK_TPL(kokkoscore PUBLIC PTHREAD) +KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 4a30c914f0..916f109758 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -45,6 +45,10 @@ #include #ifdef KOKKOS_ENABLE_CUDA +#include +#include +#include + #include #include #include @@ -52,10 +56,6 @@ #include #include -#include -#include -#include - //#include #include #include @@ -65,6 +65,22 @@ /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ +cudaStream_t Kokkos::Impl::cuda_get_deep_copy_stream() { + static cudaStream_t s = nullptr; + if (s == nullptr) { + cudaStreamCreate(&s); + } + return s; +} + +const std::unique_ptr &Kokkos::Impl::cuda_get_deep_copy_space( + bool initialize) { + static std::unique_ptr space = nullptr; + if (!space && initialize) + space = std::make_unique(Kokkos::Impl::cuda_get_deep_copy_stream()); + return space; +} + namespace Kokkos { namespace Impl { @@ -72,13 +88,6 @@ namespace { static std::atomic num_uvm_allocations(0); -cudaStream_t get_deep_copy_stream() { - static cudaStream_t s = nullptr; - if (s == nullptr) { - cudaStreamCreate(&s); - } - return s; -} } // namespace DeepCopy::DeepCopy(void *dst, const void *src, @@ -115,7 +124,7 @@ DeepCopy::DeepCopy(const Cuda &instance, void *dst, } void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { - cudaStream_t s = get_deep_copy_stream(); + cudaStream_t s = cuda_get_deep_copy_stream(); CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s)); cudaStreamSynchronize(s); } @@ -128,14 +137,14 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { namespace Kokkos { -void CudaSpace::access_error() { +KOKKOS_DEPRECATED void CudaSpace::access_error() { const std::string msg( "Kokkos::CudaSpace::access_error attempt to execute Cuda function from " "non-Cuda space"); Kokkos::Impl::throw_runtime_exception(msg); } -void CudaSpace::access_error(const void *const) { +KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) { const std::string msg( "Kokkos::CudaSpace::access_error attempt to execute Cuda function from " "non-Cuda space"); @@ -459,79 +468,6 @@ SharedAllocationRecord::attach_texture_object( return tex_obj; } -//============================================================================== -// {{{1 - -std::string SharedAllocationRecord::get_label() const { - SharedAllocationHeader header; - - Kokkos::Impl::DeepCopy( - &header, RecordBase::head(), sizeof(SharedAllocationHeader)); - - return std::string(header.m_label); -} - -std::string SharedAllocationRecord::get_label() - const { - return std::string(RecordBase::head()->m_label); -} - -std::string -SharedAllocationRecord::get_label() const { - return std::string(RecordBase::head()->m_label); -} - -// end SharedAllocationRecord::get_label() }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -SharedAllocationRecord - *SharedAllocationRecord::allocate( - const Kokkos::CudaSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -SharedAllocationRecord - *SharedAllocationRecord::allocate( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -SharedAllocationRecord - *SharedAllocationRecord::allocate( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -// end SharedAllocationRecord allocate() }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -// end SharedAllocationRecord deallocate }}}1 -//============================================================================== - //============================================================================== // {{{1 @@ -580,7 +516,7 @@ SharedAllocationRecord::SharedAllocationRecord( const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif @@ -592,13 +528,7 @@ SharedAllocationRecord::SharedAllocationRecord( SharedAllocationHeader header; - // Fill in the Header information - header.m_record = static_cast *>(this); - - strncpy(header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; + this->base_t::_fill_host_accessible_header_info(header, arg_label); // Copy to device memory Kokkos::Impl::DeepCopy(RecordBase::m_alloc_ptr, &header, @@ -611,7 +541,7 @@ SharedAllocationRecord::SharedAllocationRecord( const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif @@ -620,16 +550,8 @@ SharedAllocationRecord::SharedAllocationRecord( sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), m_tex_obj(0), m_space(arg_space) { - // Fill in the Header information, directly accessible via UVM - - RecordBase::m_alloc_ptr->m_record = this; - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; + this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, + arg_label); } SharedAllocationRecord:: @@ -639,7 +561,7 @@ SharedAllocationRecord:: const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, @@ -648,319 +570,13 @@ SharedAllocationRecord:: arg_alloc_size), sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), m_space(arg_space) { - // Fill in the Header information, directly accessible on the host - - RecordBase::m_alloc_ptr->m_record = this; - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; + this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, + arg_label); } // end SharedAllocationRecord constructors }}}1 //============================================================================== -//============================================================================== -// {{{1 - -void *SharedAllocationRecord::allocate_tracked( - const Kokkos::CudaSpace &arg_space, const std::string &arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked( - void *const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord::reallocate_tracked( - void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -void *SharedAllocationRecord::allocate_tracked( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked( - void *const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord::reallocate_tracked( - void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -void * -SharedAllocationRecord::allocate_tracked( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_alloc_label, const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void *const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void * -SharedAllocationRecord::reallocate_tracked( - void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -// end SharedAllocationRecored::(re|de|)allocate_tracked }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -SharedAllocationRecord * -SharedAllocationRecord::get_record(void *alloc_ptr) { - using RecordCuda = SharedAllocationRecord; - - using Header = SharedAllocationHeader; - - // Copy the header from the allocation - Header head; - - Header const *const head_cuda = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - - if (alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, head_cuda, sizeof(SharedAllocationHeader)); - } - - RecordCuda *const record = - alloc_ptr ? static_cast(head.m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head_cuda) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , " - "void >::get_record ERROR")); - } - - return record; -} - -SharedAllocationRecord *SharedAllocationRecord< - Kokkos::CudaUVMSpace, void>::get_record(void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordCuda = SharedAllocationRecord; - - Header *const h = - alloc_ptr ? reinterpret_cast
(alloc_ptr) - 1 : nullptr; - - if (!alloc_ptr || h->m_record->m_alloc_ptr != h) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::SharedAllocationRecord< " - "Kokkos::CudaUVMSpace , void >::get_record ERROR")); - } - - return static_cast(h->m_record); -} - -SharedAllocationRecord - *SharedAllocationRecord::get_record( - void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordCuda = SharedAllocationRecord; - - Header *const h = - alloc_ptr ? reinterpret_cast
(alloc_ptr) - 1 : nullptr; - - if (!alloc_ptr || h->m_record->m_alloc_ptr != h) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::SharedAllocationRecord< " - "Kokkos::CudaHostPinnedSpace , void >::get_record ERROR")); - } - - return static_cast(h->m_record); -} - -// end SharedAllocationRecord::get_record() }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -// Iterate records to print orphaned memory ... -void SharedAllocationRecord::print_records( - std::ostream &s, const Kokkos::CudaSpace &, bool detail) { - (void)s; - (void)detail; -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord *r = &s_root_record; - - char buffer[256]; - - SharedAllocationHeader head; - - if (detail) { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - } else { - head.m_label[0] = 0; - } - - // Formatting dependent on sizeof(uintptr_t) - const char *format_string; - - if (sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = - "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx " - "+ %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = - "Cuda addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ " - "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n"; - } - - snprintf(buffer, 256, format_string, reinterpret_cast(r), - reinterpret_cast(r->m_prev), - reinterpret_cast(r->m_next), - reinterpret_cast(r->m_alloc_ptr), r->m_alloc_size, - r->m_count, reinterpret_cast(r->m_dealloc), - head.m_label); - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } else { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - - // Formatting dependent on sizeof(uintptr_t) - const char *format_string; - - if (sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = "Cuda [ 0x%.12lx + %ld ] %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = "Cuda [ 0x%.12llx + %ld ] %s\n"; - } - - snprintf(buffer, 256, format_string, - reinterpret_cast(r->data()), r->size(), - head.m_label); - } else { - snprintf(buffer, 256, "Cuda [ 0 + 0 ]\n"); - } - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } -#else - Kokkos::Impl::throw_runtime_exception( - "SharedAllocationHeader::print_records only works with " - "KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -void SharedAllocationRecord::print_records( - std::ostream &s, const Kokkos::CudaUVMSpace &, bool detail) { - (void)s; - (void)detail; -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord::print_host_accessible_records( - s, "CudaUVM", &s_root_record, detail); -#else - Kokkos::Impl::throw_runtime_exception( - "SharedAllocationHeader::print_records only works with " - "KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -void SharedAllocationRecord::print_records( - std::ostream &s, const Kokkos::CudaHostPinnedSpace &, bool detail) { - (void)s; - (void)detail; -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord::print_host_accessible_records( - s, "CudaHostPinned", &s_root_record, detail); -#else - Kokkos::Impl::throw_runtime_exception( - "SharedAllocationHeader::print_records only works with " - "KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -// end SharedAllocationRecord::print_records() }}}1 -//============================================================================== - void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; @@ -984,6 +600,29 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, } // namespace Impl } // namespace Kokkos + +//============================================================================== +// {{{1 + +#include + +namespace Kokkos { +namespace Impl { + +// To avoid additional compilation cost for something that's (mostly?) not +// performance sensitive, we explicity instantiate these CRTP base classes here, +// where we have access to the associated *_timpl.hpp header files. +template class SharedAllocationRecordCommon; +template class HostInaccessibleSharedAllocationRecordCommon; +template class SharedAllocationRecordCommon; +template class SharedAllocationRecordCommon; + +} // end namespace Impl +} // end namespace Kokkos + +// end Explicit instantiations of CRTP Base classes }}}1 +//============================================================================== + #else void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {} #endif // KOKKOS_ENABLE_CUDA diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index 0d6d3bdb3a..0f4259072d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -140,7 +140,7 @@ inline int cuda_deduce_block_size(bool early_termination, } } - if (early_termination && blocks_per_sm != 0) break; + if (early_termination && opt_block_size != 0) break; } return opt_block_size; @@ -222,7 +222,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { case 52: case 61: return 96; case 70: - case 80: return 8; + case 80: + case 86: return 8; case 75: return 32; default: Kokkos::Impl::throw_runtime_exception( diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp index a9a62380e5..ec9c434fe6 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp @@ -175,30 +175,42 @@ class half_t { return cast_from_half(*this); } + /** + * Conversion constructors. + * + * Support implicit conversions from impl_type, float, double -> half_t + * Mixed precision expressions require upcasting which is done in the + * "// Binary Arithmetic" operator overloads below. + * + * Support implicit conversions from integral types -> half_t. + * Expressions involving half_t with integral types require downcasting + * the integral types to half_t. Existing operator overloads can handle this + * with the addition of the below implicit conversion constructors. + */ KOKKOS_FUNCTION half_t(impl_type rhs) : val(rhs) {} KOKKOS_FUNCTION - explicit half_t(float rhs) : val(cast_to_half(rhs).val) {} + half_t(float rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + half_t(double rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION explicit half_t(bool rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(double rhs) : val(cast_to_half(rhs).val) {} + half_t(short rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(short rhs) : val(cast_to_half(rhs).val) {} + half_t(int rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(int rhs) : val(cast_to_half(rhs).val) {} + half_t(long rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(long rhs) : val(cast_to_half(rhs).val) {} + half_t(long long rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(long long rhs) : val(cast_to_half(rhs).val) {} + half_t(unsigned short rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(unsigned short rhs) : val(cast_to_half(rhs).val) {} + half_t(unsigned int rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(unsigned int rhs) : val(cast_to_half(rhs).val) {} + half_t(unsigned long rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(unsigned long rhs) : val(cast_to_half(rhs).val) {} - KOKKOS_FUNCTION - explicit half_t(unsigned long long rhs) : val(cast_to_half(rhs).val) {} + half_t(unsigned long long rhs) : val(cast_to_half(rhs).val) {} // Unary operators KOKKOS_FUNCTION @@ -243,7 +255,7 @@ class half_t { #else float tmp = __half2float(val); --tmp; - val = __float2half(tmp); + val = __float2half(tmp); #endif return *this; } @@ -276,88 +288,317 @@ class half_t { return *this; } + template + KOKKOS_FUNCTION void operator=(T rhs) volatile { + val = cast_to_half(rhs).val; + } + // Compound operators KOKKOS_FUNCTION half_t& operator+=(half_t rhs) { #ifdef __CUDA_ARCH__ val += rhs.val; #else - val = __float2half(__half2float(val) + __half2float(rhs.val)); + val = __float2half(__half2float(val) + __half2float(rhs.val)); #endif return *this; } + KOKKOS_FUNCTION + volatile half_t& operator+=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. + val = const_cast(val) + rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) + + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for += + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator+=(T& lhs, half_t rhs) { + lhs += static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator+=(float rhs) { + float result = static_cast(val) + rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator+=(double rhs) { + double result = static_cast(val) + rhs; + val = static_cast(result); + return *this; + } + KOKKOS_FUNCTION half_t& operator-=(half_t rhs) { #ifdef __CUDA_ARCH__ val -= rhs.val; #else - val = __float2half(__half2float(val) - __half2float(rhs.val)); + val = __float2half(__half2float(val) - __half2float(rhs.val)); #endif return *this; } + KOKKOS_FUNCTION + volatile half_t& operator-=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. + val = const_cast(val) - rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) - + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for -= + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator-=(T& lhs, half_t rhs) { + lhs -= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator-=(float rhs) { + float result = static_cast(val) - rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator-=(double rhs) { + double result = static_cast(val) - rhs; + val = static_cast(result); + return *this; + } + KOKKOS_FUNCTION half_t& operator*=(half_t rhs) { #ifdef __CUDA_ARCH__ val *= rhs.val; #else - val = __float2half(__half2float(val) * __half2float(rhs.val)); + val = __float2half(__half2float(val) * __half2float(rhs.val)); #endif return *this; } + KOKKOS_FUNCTION + volatile half_t& operator*=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. + val = const_cast(val) * rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) * + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for *= + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator*=(T& lhs, half_t rhs) { + lhs *= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator*=(float rhs) { + float result = static_cast(val) * rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator*=(double rhs) { + double result = static_cast(val) * rhs; + val = static_cast(result); + return *this; + } + KOKKOS_FUNCTION half_t& operator/=(half_t rhs) { #ifdef __CUDA_ARCH__ val /= rhs.val; #else - val = __float2half(__half2float(val) / __half2float(rhs.val)); + val = __float2half(__half2float(val) / __half2float(rhs.val)); #endif return *this; } + KOKKOS_FUNCTION + volatile half_t& operator/=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. + val = const_cast(val) / rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) / + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for /= + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator/=(T& lhs, half_t rhs) { + lhs /= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator/=(float rhs) { + float result = static_cast(val) / rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator/=(double rhs) { + double result = static_cast(val) / rhs; + val = static_cast(result); + return *this; + } + // Binary Arithmetic KOKKOS_FUNCTION half_t friend operator+(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val += rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator+(half_t lhs, T rhs) { + return T(lhs) + rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator+(T lhs, half_t rhs) { + return lhs + T(rhs); + } + KOKKOS_FUNCTION half_t friend operator-(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val -= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for - + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator-(half_t lhs, T rhs) { + return T(lhs) - rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator-(T lhs, half_t rhs) { + return lhs - T(rhs); + } + KOKKOS_FUNCTION half_t friend operator*(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val *= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for * + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator*(half_t lhs, T rhs) { + return T(lhs) * rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator*(T lhs, half_t rhs) { + return lhs * T(rhs); + } + KOKKOS_FUNCTION half_t friend operator/(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val /= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for / + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator/(half_t lhs, T rhs) { + return T(lhs) / rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator/(T lhs, half_t rhs) { + return lhs / T(rhs); + } + // Logical operators KOKKOS_FUNCTION bool operator!() const { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index b8e8163458..016cb6cdcb 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -248,11 +249,11 @@ void CudaInternal::print_configuration(std::ostream &s) const { const CudaInternalDevices &dev_info = CudaInternalDevices::singleton(); #if defined(KOKKOS_ENABLE_CUDA) - s << "macro KOKKOS_ENABLE_CUDA : defined" << std::endl; + s << "macro KOKKOS_ENABLE_CUDA : defined\n"; #endif #if defined(CUDA_VERSION) s << "macro CUDA_VERSION = " << CUDA_VERSION << " = version " - << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << std::endl; + << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n'; #endif for (int i = 0; i < dev_info.m_cudaDevCount; ++i) { @@ -274,7 +275,6 @@ CudaInternal::~CudaInternal() { m_scratchConcurrentBitset) { std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" << std::endl; - std::cerr.flush(); } m_cudaDev = -1; @@ -358,8 +358,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { if (m_cudaArch == 0) { std::stringstream ss; - ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture" - << std::endl; + ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"; std::string msg = ss.str(); Kokkos::abort(msg.c_str()); } @@ -373,7 +372,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { "compute capability " << compiled_major << "." << compiled_minor << " on device with compute capability " << cudaProp.major << "." - << cudaProp.minor << " is not supported by CUDA!" << std::endl; + << cudaProp.minor << " is not supported by CUDA!\n"; std::string msg = ss.str(); Kokkos::abort(msg.c_str()); } @@ -458,7 +457,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { Kokkos::Impl::SharedAllocationRecord; Record *const r = - Record::allocate(Kokkos::CudaSpace(), "InternalScratchBitset", + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchBitset", sizeof(uint32_t) * buffer_bound); Record::increment(r); @@ -492,17 +491,11 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { #ifdef KOKKOS_ENABLE_CUDA_UVM if (Kokkos::show_warnings() && !cuda_launch_blocking()) { - std::cerr << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into " - "UVMSpace by default" - << std::endl; - std::cerr << " without setting " - "CUDA_LAUNCH_BLOCKING=1." - << std::endl; - std::cerr << " The code must call " - "Cuda().fence() after each kernel" - << std::endl; - std::cerr << " or will likely crash when " - "accessing data on the host." + std::cerr << R"warning( +Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default + without setting CUDA_LAUNCH_BLOCKING=1. + The code must call Cuda().fence() after each kernel + or will likely crash when accessing data on the host.)warning" << std::endl; } @@ -520,19 +513,13 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { if (Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc)) { - std::cerr << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into " - "UVMSpace by default" + std::cerr << R"warning( +Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default + without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or + setting CUDA_VISIBLE_DEVICES. + This could on multi GPU systems lead to severe performance" + penalties.)warning" << std::endl; - std::cerr << " without setting " - "CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " - << std::endl; - std::cerr - << " setting CUDA_VISIBLE_DEVICES." - << std::endl; - std::cerr << " This could on multi GPU " - "systems lead to severe performance" - << std::endl; - std::cerr << " penalties." << std::endl; } #endif @@ -575,7 +562,7 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const { if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); Record *const r = - Record::allocate(Kokkos::CudaSpace(), "InternalScratchFlags", + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", (sizeof(ScratchGrain) * m_scratchFlagsCount)); Record::increment(r); @@ -600,7 +587,7 @@ Cuda::size_type *CudaInternal::scratch_space(const Cuda::size_type size) const { if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); Record *const r = - Record::allocate(Kokkos::CudaSpace(), "InternalScratchSpace", + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", (sizeof(ScratchGrain) * m_scratchSpaceCount)); Record::increment(r); @@ -624,7 +611,7 @@ Cuda::size_type *CudaInternal::scratch_unified( Record::decrement(Record::get_record(m_scratchUnified)); Record *const r = Record::allocate( - Kokkos::CudaHostPinnedSpace(), "InternalScratchUnified", + Kokkos::CudaHostPinnedSpace(), "Kokkos::InternalScratchUnified", (sizeof(ScratchGrain) * m_scratchUnifiedCount)); Record::increment(r); @@ -646,8 +633,9 @@ Cuda::size_type *CudaInternal::scratch_functor( if (m_scratchFunctor) Record::decrement(Record::get_record(m_scratchFunctor)); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "InternalScratchFunctor", m_scratchFunctorSize); + Record *const r = + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor", + m_scratchFunctorSize); Record::increment(r); @@ -662,7 +650,7 @@ void *CudaInternal::resize_team_scratch_space(std::int64_t bytes, if (m_team_scratch_current_size == 0) { m_team_scratch_current_size = bytes; m_team_scratch_ptr = Kokkos::kokkos_malloc( - "CudaSpace::ScratchMemory", m_team_scratch_current_size); + "Kokkos::CudaSpace::TeamScratchMemory", m_team_scratch_current_size); } if ((bytes > m_team_scratch_current_size) || ((bytes < m_team_scratch_current_size) && (force_shrink))) { @@ -676,6 +664,9 @@ void *CudaInternal::resize_team_scratch_space(std::int64_t bytes, //---------------------------------------------------------------------------- void CudaInternal::finalize() { + // skip if finalize() has already been called + if (was_finalized) return; + was_finalized = true; if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { // Only finalize this if we're the singleton @@ -719,6 +710,11 @@ void CudaInternal::finalize() { if (this == &singleton()) { cudaFreeHost(constantMemHostStaging); cudaEventDestroy(constantMemReusable); + auto &deep_copy_space = + Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); + if (deep_copy_space) + deep_copy_space->impl_internal_space_instance()->finalize(); + cudaStreamDestroy(cuda_get_deep_copy_stream()); } } @@ -821,62 +817,23 @@ Cuda::size_type Cuda::device_arch() { void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); } Cuda::Cuda() - : m_space_instance(&Impl::CudaInternal::singleton()), m_counter(nullptr) { + : m_space_instance(&Impl::CudaInternal::singleton(), + [](Impl::CudaInternal *) {}) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); } Cuda::Cuda(cudaStream_t stream) - : m_space_instance(new Impl::CudaInternal), m_counter(new int(1)) { + : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { + ptr->finalize(); + delete ptr; + }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev, stream); } -KOKKOS_FUNCTION Cuda::Cuda(Cuda &&other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; -} - -KOKKOS_FUNCTION Cuda::Cuda(const Cuda &other) - : m_space_instance(other.m_space_instance), m_counter(other.m_counter) { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif -} - -KOKKOS_FUNCTION Cuda &Cuda::operator=(Cuda &&other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; - return *this; -} - -KOKKOS_FUNCTION Cuda &Cuda::operator=(const Cuda &other) { - m_space_instance = other.m_space_instance; - m_counter = other.m_counter; -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif - return *this; -} - -KOKKOS_FUNCTION Cuda::~Cuda() noexcept { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA - if (m_counter == nullptr) return; - int const count = Kokkos::atomic_fetch_sub(m_counter, 1); - if (count == 1) { - delete m_counter; - m_space_instance->finalize(); - delete m_space_instance; - } -#endif -} - void Cuda::print_configuration(std::ostream &s, const bool) { Impl::CudaInternal::singleton().print_configuration(s); } @@ -924,54 +881,53 @@ void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); } void CudaSpaceInitializer::print_configuration(std::ostream &msg, const bool detail) { - msg << "Device Execution Space:" << std::endl; - msg << " KOKKOS_ENABLE_CUDA: "; - msg << "yes" << std::endl; + msg << "Device Execution Space:\n"; + msg << " KOKKOS_ENABLE_CUDA: yes\n"; - msg << "Cuda Atomics:" << std::endl; + msg << "Cuda Atomics:\n"; msg << " KOKKOS_ENABLE_CUDA_ATOMICS: "; #ifdef KOKKOS_ENABLE_CUDA_ATOMICS - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif - msg << "Cuda Options:" << std::endl; + msg << "Cuda Options:\n"; msg << " KOKKOS_ENABLE_CUDA_LAMBDA: "; #ifdef KOKKOS_ENABLE_CUDA_LAMBDA - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; #ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUDA_UVM: "; #ifdef KOKKOS_ENABLE_CUDA_UVM - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUSPARSE: "; #ifdef KOKKOS_ENABLE_CUSPARSE - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << "\nCuda Runtime Configuration:" << std::endl; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 13773d70c5..aaec2c2926 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -17,30 +17,24 @@ namespace Kokkos { namespace Impl { struct CudaTraits { - enum : CudaSpace::size_type { WarpSize = 32 /* 0x0020 */ }; - enum : CudaSpace::size_type { - WarpIndexMask = 0x001f /* Mask for warpindex */ - }; - enum : CudaSpace::size_type { - WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */ - }; + static constexpr CudaSpace::size_type WarpSize = 32 /* 0x0020 */; + static constexpr CudaSpace::size_type WarpIndexMask = + 0x001f; /* Mask for warpindex */ + static constexpr CudaSpace::size_type WarpIndexShift = + 5; /* WarpSize == 1 << WarpShift */ - enum : CudaSpace::size_type { - ConstantMemoryUsage = 0x008000 /* 32k bytes */ - }; - enum : CudaSpace::size_type { - ConstantMemoryCache = 0x002000 /* 8k bytes */ - }; - enum : CudaSpace::size_type { - KernelArgumentLimit = 0x001000 /* 4k bytes */ - }; - enum : CudaSpace::size_type { - MaxHierarchicalParallelism = 1024 /* team_size * vector_length */ - }; + static constexpr CudaSpace::size_type ConstantMemoryUsage = + 0x008000; /* 32k bytes */ + static constexpr CudaSpace::size_type ConstantMemoryCache = + 0x002000; /* 8k bytes */ + static constexpr CudaSpace::size_type KernelArgumentLimit = + 0x001000; /* 4k bytes */ + static constexpr CudaSpace::size_type MaxHierarchicalParallelism = + 1024; /* team_size * vector_length */ using ConstantGlobalBufferType = unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; - enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ }; + static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */; KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count( CudaSpace::size_type i) { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 39404e0bf3..d892a893b3 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -158,6 +158,9 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { } } +// This function needs to be template on DriverType and LaunchBounds +// so that the static bool is unique for each type combo +// KernelFuncPtr does not necessarily contain that type information. template inline void configure_shmem_preference(KernelFuncPtr const& func, bool prefer_shmem) { @@ -355,8 +358,7 @@ struct CudaParallelLaunchKernelInvoker< if (!Impl::is_empty_launch(grid, block)) { Impl::check_shmem_request(cuda_instance, shmem); - Impl::configure_shmem_preference( + Impl::configure_shmem_preference( base_t::get_kernel_func(), prefer_shmem); void const* args[] = {&driver}; @@ -449,8 +451,7 @@ struct CudaParallelLaunchKernelInvoker< if (!Impl::is_empty_launch(grid, block)) { Impl::check_shmem_request(cuda_instance, shmem); - Impl::configure_shmem_preference( + Impl::configure_shmem_preference( base_t::get_kernel_func(), prefer_shmem); auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); @@ -627,9 +628,8 @@ struct CudaParallelLaunchImpl< get_cuda_func_attributes(), block, shmem, prefer_shmem); Impl::configure_shmem_preference< - DriverType, Kokkos::LaunchBounds, - decltype(base_t::get_kernel_func())>(base_t::get_kernel_func(), - prefer_shmem); + DriverType, Kokkos::LaunchBounds>( + base_t::get_kernel_func(), prefer_shmem); KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp new file mode 100644 index 0000000000..12b7f70a97 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp @@ -0,0 +1,37 @@ +#ifndef KOKKOS_CUDA_MDRANGEPOLICY_HPP_ +#define KOKKOS_CUDA_MDRANGEPOLICY_HPP_ + +#include + +namespace Kokkos { + +template <> +struct default_outer_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +template <> +struct default_inner_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +namespace Impl { + +// Settings for MDRangePolicy +template <> +inline TileSizeProperties get_tile_size_properties( + const Kokkos::Cuda& space) { + TileSizeProperties properties; + properties.max_threads = + space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.default_largest_tile_size = 16; + properties.default_tile_size = 2; + properties.max_total_tile_size = 512; + return properties; +} + +} // Namespace Impl +} // Namespace Kokkos +#endif diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp index 131d180980..2834e6f3de 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -67,6 +68,7 @@ #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -474,7 +476,7 @@ class ParallelFor, Kokkos::Cuda> { Policy const& get_policy() const { return m_policy; } - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { const Member work_stride = blockDim.y * gridDim.x; const Member work_end = m_policy.end(); @@ -537,9 +539,23 @@ class ParallelFor, Kokkos::Cuda> { const Policy m_rp; public: + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + cudaFuncAttributes attr = + CudaParallelLaunch::get_cuda_func_attributes(); + auto const& prop = pol.space().cuda_device_prop(); + // Limits due to registers/SM, MDRange doesn't have + // shared memory constraints + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); + } Policy const& get_policy() const { return m_rp; } - - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { Kokkos::Impl::DeviceIterateTile(m_rp, m_functor) .exec_range(); @@ -689,7 +705,7 @@ class ParallelFor, public: Policy const& get_policy() const { return m_policy; } - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { // Iterate this block through the league int64_t threadid = 0; if (m_scratch_size[1] > 0) { @@ -1248,8 +1264,21 @@ class ParallelReduce, ReducerType, using DummySHMEMReductionType = int; public: + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + cudaFuncAttributes attr = + CudaParallelLaunch::get_cuda_func_attributes(); + auto const& prop = pol.space().cuda_device_prop(); + // Limits due do registers/SM + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); + } Policy const& get_policy() const { return m_policy; } - inline __device__ void exec_range(reference_type update) const { Kokkos::Impl::Reduce::DeviceIterateTile, ReducerType, .exec_range(); } - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { /* run(Kokkos::Impl::if_c::select(1,1.0) ); } @@ -2074,7 +2103,7 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- - __device__ inline void initial(void) const { + __device__ inline void initial() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2110,7 +2139,7 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- - __device__ inline void final(void) const { + __device__ inline void final() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2195,7 +2224,7 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (m_run_serial) { typename ValueTraits::value_type value; @@ -2364,7 +2393,7 @@ class ParallelScanWithTotal, //---------------------------------------- - __device__ inline void initial(void) const { + __device__ inline void initial() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2400,7 +2429,7 @@ class ParallelScanWithTotal, //---------------------------------------- - __device__ inline void final(void) const { + __device__ inline void final() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2487,7 +2516,7 @@ class ParallelScanWithTotal, //---------------------------------------- - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (m_run_serial) { typename ValueTraits::value_type value; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index 4b472f5d4f..e780639015 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -661,13 +661,14 @@ KOKKOS_INLINE_FUNCTION thread, count); } -template -KOKKOS_INLINE_FUNCTION - Impl::ThreadVectorRangeBoundariesStruct - ThreadVectorRange(const Impl::CudaTeamMember& thread, iType arg_begin, - iType arg_end) { +template +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type::type, Impl::CudaTeamMember> +ThreadVectorRange(const Impl::CudaTeamMember& thread, iType1 arg_begin, + iType2 arg_end) { + using iType = typename std::common_type::type; return Impl::ThreadVectorRangeBoundariesStruct( - thread, arg_begin, arg_end); + thread, iType(arg_begin), iType(arg_end)); } KOKKOS_INLINE_FUNCTION @@ -983,7 +984,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( //---------------------------------------------------------------------------- -/** \brief Intra-thread vector parallel exclusive prefix sum. +/** \brief Intra-thread vector parallel scan with reducer. * * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) * @@ -991,25 +992,25 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( * thread and a scan operation is performed. * The last call to closure has final == true. */ -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct& - loop_boundaries, - const Closure& closure) { +template +KOKKOS_INLINE_FUNCTION + typename std::enable_if::value>::type + parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::CudaTeamMember>& loop_boundaries, + const Closure& closure, const ReducerType& reducer) { (void)loop_boundaries; (void)closure; + (void)reducer; #ifdef __CUDA_ARCH__ - // Extract value_type from closure - - using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + using value_type = typename ReducerType::value_type; + value_type accum; + reducer.init(accum); + const value_type identity = accum; // Loop through boundaries by vector-length chunks // must scan at each iteration - value_type accum = 0; - // All thread "lanes" must loop the same number of times. // Determine an loop end for all thread "lanes." // Requires: @@ -1026,44 +1027,68 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const int end = loop_boundaries.end + (rem ? blockDim.x - rem : 0); for (int i = threadIdx.x; i < end; i += blockDim.x) { - value_type val = 0; + value_type val = identity; - // First acquire per-lane contributions: - if (i < loop_boundaries.end) closure(i, val, false); + // First acquire per-lane contributions. + // This sets i's val to i-1's contribution + // to make the latter in_place_shfl_up an + // exclusive scan -- the final accumulation + // of i's val will be included in the second + // closure call later. + if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false); - value_type sval = val; - - // Bottom up inclusive scan in triangular pattern + // Bottom up exclusive scan in triangular pattern // where each CUDA thread is the root of a reduction tree // from the zeroth "lane" to itself. // [t] += [t-1] if t >= 1 // [t] += [t-2] if t >= 2 // [t] += [t-4] if t >= 4 // ... - + // This differs from the non-reducer overload, where an inclusive scan was + // implemented, because in general the binary operator cannot be inverted + // and we would not be able to remove the inclusive contribution by + // inversion. for (int j = 1; j < (int)blockDim.x; j <<= 1) { - value_type tmp = 0; - Impl::in_place_shfl_up(tmp, sval, j, blockDim.x, active_mask); + value_type tmp = identity; + Impl::in_place_shfl_up(tmp, val, j, blockDim.x, active_mask); if (j <= (int)threadIdx.x) { - sval += tmp; + reducer.join(val, tmp); } } - // Include accumulation and remove value for exclusive scan: - val = accum + sval - val; + // Include accumulation + reducer.join(val, accum); - // Provide exclusive scan value: + // Update i's contribution into the val + // and add it to accum for next round if (i < loop_boundaries.end) closure(i, val, true); - - // Accumulate the last value in the inclusive scan: - Impl::in_place_shfl(sval, sval, mask, blockDim.x, active_mask); - - accum += sval; + Impl::in_place_shfl(accum, val, mask, blockDim.x, active_mask); } #endif } +//---------------------------------------------------------------------------- + +/** \brief Intra-thread vector parallel exclusive prefix sum. + * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to all vector lanes in the + * thread and a scan operation is performed. + * The last call to closure has final == true. + */ +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct& + loop_boundaries, + const Closure& closure) { + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + value_type dummy; + parallel_scan(loop_boundaries, closure, Kokkos::Sum(dummy)); +} + } // namespace Kokkos namespace Kokkos { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp index f24abb377d..c55956ede9 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp @@ -139,7 +139,7 @@ struct CudaLDGFetch { template KOKKOS_INLINE_FUNCTION ValueType operator[](const iType& i) const { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) && (350 <= _CUDA_ARCH__) AliasType v = __ldg(reinterpret_cast(&m_ptr[i])); return *(reinterpret_cast(&v)); #else diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index 05876a9f02..fc52e41514 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -46,6 +46,7 @@ #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP #include +#include namespace Kokkos { namespace Impl { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index 89135b6c45..9278d1bdc9 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -75,17 +75,6 @@ void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { hipOccupancy( numBlocks, blockSize, sharedmem); } -template -struct HIPGetMaxBlockSize; - -template -int hip_get_max_block_size(typename DriverType::functor_type const &f, - size_t const vector_length, - size_t const shmem_extra_block, - size_t const shmem_extra_thread) { - return HIPGetMaxBlockSize::get_block_size( - f, vector_length, shmem_extra_block, shmem_extra_thread); -} template int hip_internal_get_block_size(const F &condition_check, @@ -131,10 +120,6 @@ int hip_internal_get_block_size(const F &condition_check, int opt_block_size = (blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm; int opt_threads_per_sm = threads_per_sm; - // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i - // Achieved: %i %i Opt: %i %i\n",block_size, - // shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem, - // regs_per_sm,regs_per_wavefront,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm); block_size -= HIPTraits::WarpSize; while (condition_check(blocks_per_sm) && (block_size >= HIPTraits::WarpSize)) { @@ -160,10 +145,6 @@ int hip_internal_get_block_size(const F &condition_check, opt_threads_per_sm = threads_per_sm; } } - // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i - // Achieved: %i %i Opt: %i %i\n",block_size, - // shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem, - // regs_per_sm,regs_per_wavefront,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm); block_size -= HIPTraits::WarpSize; } return opt_block_size; @@ -178,62 +159,6 @@ int hip_get_max_block_size(const HIPInternal *hip_instance, [](int x) { return x == 0; }, hip_instance, attr, f, vector_length, shmem_block, shmem_thread); } -template -struct HIPGetMaxBlockSize { - static int get_block_size(typename DriverType::functor_type const &f, - size_t const vector_length, - size_t const shmem_extra_block, - size_t const shmem_extra_thread) { - int numBlocks = 0; - int blockSize = LaunchBounds::maxTperB == 0 ? 1024 : LaunchBounds::maxTperB; - int sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - - if (numBlocks > 0) return blockSize; - while (blockSize > HIPTraits::WarpSize && numBlocks == 0) { - blockSize /= 2; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - } - int blockSizeUpperBound = blockSize * 2; - while (blockSize < blockSizeUpperBound && numBlocks > 0) { - blockSize += HIPTraits::WarpSize; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - } - return blockSize - HIPTraits::WarpSize; - } -}; - -template -struct HIPGetOptBlockSize; - -template -int hip_get_opt_block_size(typename DriverType::functor_type const &f, - size_t const vector_length, - size_t const shmem_extra_block, - size_t const shmem_extra_thread) { - return HIPGetOptBlockSize< - DriverType, LaunchBounds, - (HIPTraits::ConstantMemoryUseThreshold < - sizeof(DriverType))>::get_block_size(f, vector_length, shmem_extra_block, - shmem_extra_thread); -} template int hip_get_opt_block_size(HIPInternal const *hip_instance, @@ -245,157 +170,6 @@ int hip_get_opt_block_size(HIPInternal const *hip_instance, shmem_block, shmem_thread); } -// FIXME_HIP the code is identical to the false struct except for -// hip_parallel_launch_constant_memory -template -struct HIPGetOptBlockSize, true> { - static int get_block_size(typename DriverType::functor_type const &f, - size_t const vector_length, - size_t const shmem_extra_block, - size_t const shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - - while (blockSize < HIPTraits::MaxThreadsPerBlock) { - blockSize *= 2; - - // calculate the occupancy with that optBlockSize and check whether its - // larger than the largest one found so far - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - hipOccupancy(&numBlocks, blockSize, sharedmem); - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - return bestBlockSize; - } -}; - -template -struct HIPGetOptBlockSize, false> { - static int get_block_size(const typename DriverType::functor_type &f, - const size_t vector_length, - const size_t shmem_extra_block, - const size_t shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - - while (blockSize < HIPTraits::MaxThreadsPerBlock) { - blockSize *= 2; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - return bestBlockSize; - } -}; - -// FIXME_HIP the code is identical to the false struct except for -// hip_parallel_launch_constant_memory -template -struct HIPGetOptBlockSize< - DriverType, Kokkos::LaunchBounds, - true> { - static int get_block_size(const typename DriverType::functor_type &f, - const size_t vector_length, - const size_t shmem_extra_block, - const size_t shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - int max_threads_per_block = - std::min(MaxThreadsPerBlock, - hip_internal_maximum_warp_count() * HIPTraits::WarpSize); - - while (blockSize < max_threads_per_block) { - blockSize *= 2; - - // calculate the occupancy with that optBlockSize and check whether its - // larger than the largest one found so far - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - hipOccupancy( - &numBlocks, blockSize, sharedmem); - if (numBlocks >= static_cast(MinBlocksPerSM) && - blockSize <= static_cast(MaxThreadsPerBlock)) { - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - } - if (maxOccupancy > 0) return bestBlockSize; - return -1; - } -}; - -template -struct HIPGetOptBlockSize< - DriverType, Kokkos::LaunchBounds, - false> { - static int get_block_size(const typename DriverType::functor_type &f, - const size_t vector_length, - const size_t shmem_extra_block, - const size_t shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - int max_threads_per_block = - std::min(MaxThreadsPerBlock, - hip_internal_maximum_warp_count() * HIPTraits::WarpSize); - - while (blockSize < max_threads_per_block) { - blockSize *= 2; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy( - &numBlocks, blockSize, sharedmem); - if (numBlocks >= int(MinBlocksPerSM) && - blockSize <= int(MaxThreadsPerBlock)) { - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - } - if (maxOccupancy > 0) return bestBlockSize; - return -1; - } -}; - } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 45512038ac..18ef10e22c 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -164,6 +164,8 @@ HIPInternal &HIPInternal::singleton() { void HIPInternal::fence() const { HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); + // can reset our cycle id now as well + m_cycleId = 0; } void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { @@ -256,7 +258,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { void>; Record *const r = Record::allocate(Kokkos::Experimental::HIPSpace(), - "InternalScratchBitset", + "Kokkos::InternalScratchBitset", sizeof(uint32_t) * buffer_bound); Record::increment(r); @@ -303,8 +305,10 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_space( Kokkos::Impl::SharedAllocationRecord; - static Record *const r = Record::allocate( - Kokkos::Experimental::HIPSpace(), "InternalScratchSpace", + if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + + Record *const r = Record::allocate( + Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchSpace", (sizeScratchGrain * m_scratchSpaceCount)); Record::increment(r); @@ -325,8 +329,10 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags( Kokkos::Impl::SharedAllocationRecord; + if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + Record *const r = Record::allocate( - Kokkos::Experimental::HIPSpace(), "InternalScratchFlags", + Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchFlags", (sizeScratchGrain * m_scratchFlagsCount)); Record::increment(r); @@ -345,7 +351,7 @@ void *HIPInternal::resize_team_scratch_space(std::int64_t bytes, if (m_team_scratch_current_size == 0) { m_team_scratch_current_size = bytes; m_team_scratch_ptr = Kokkos::kokkos_malloc( - "HIPSpace::ScratchMemory", m_team_scratch_current_size); + "Kokkos::HIPSpace::TeamScratchMemory", m_team_scratch_current_size); } if ((bytes > m_team_scratch_current_size) || ((bytes < m_team_scratch_current_size) && (force_shrink))) { @@ -388,6 +394,40 @@ void HIPInternal::finalize() { m_team_scratch_current_size = 0; m_team_scratch_ptr = nullptr; } + if (nullptr != d_driverWorkArray) { + HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + d_driverWorkArray = nullptr; + } +} + +char *HIPInternal::get_next_driver(size_t driverTypeSize) const { + std::lock_guard const lock(m_mutexWorkArray); + if (d_driverWorkArray == nullptr) { + HIP_SAFE_CALL( + hipHostMalloc(&d_driverWorkArray, + m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), + hipHostMallocNonCoherent)); + } + if (driverTypeSize > m_maxDriverTypeSize) { + // fence handles the cycle id reset for us + fence(); + HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + m_maxDriverTypeSize = driverTypeSize; + if (m_maxDriverTypeSize % 128 != 0) + m_maxDriverTypeSize = + m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128; + HIP_SAFE_CALL( + hipHostMalloc(&d_driverWorkArray, + m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), + hipHostMallocNonCoherent)); + } else { + m_cycleId = (m_cycleId + 1) % m_maxDriverCycles; + if (m_cycleId == 0) { + // ensure any outstanding kernels are completed before we wrap around + fence(); + } + } + return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId]; } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 07ec8625e6..f4f88628e3 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -49,6 +49,8 @@ #include +#include + namespace Kokkos { namespace Experimental { namespace Impl { @@ -83,33 +85,46 @@ class HIPInternal { public: using size_type = ::Kokkos::Experimental::HIP::size_type; - int m_hipDev; - int m_hipArch; - unsigned m_multiProcCount; - unsigned m_maxWarpCount; - unsigned m_maxBlock; - unsigned m_maxBlocksPerSM; - unsigned m_maxSharedWords; + int m_hipDev = -1; + int m_hipArch = -1; + unsigned m_multiProcCount = 0; + unsigned m_maxWarpCount = 0; + unsigned m_maxBlock = 0; + unsigned m_maxBlocksPerSM = 0; + unsigned m_maxSharedWords = 0; int m_regsPerSM; - int m_shmemPerSM; - int m_maxShmemPerBlock; - int m_maxThreadsPerSM; + int m_shmemPerSM = 0; + int m_maxShmemPerBlock = 0; + int m_maxThreadsPerSM = 0; + + // array of DriverTypes to be allocated in host-pinned memory for async + // kernel launches + mutable char *d_driverWorkArray = nullptr; + // number of kernel launches that can be in-flight w/o synchronization + const int m_maxDriverCycles = 100; + // max size of a DriverType [bytes] + mutable size_t m_maxDriverTypeSize = 1024 * 10; + // the current index in the driverWorkArray + mutable int m_cycleId = 0; + // mutex to access d_driverWorkArray + mutable std::mutex m_mutexWorkArray; // Scratch Spaces for Reductions - size_type m_scratchSpaceCount; - size_type m_scratchFlagsCount; + size_type m_scratchSpaceCount = 0; + size_type m_scratchFlagsCount = 0; - size_type *m_scratchSpace; - size_type *m_scratchFlags; + size_type *m_scratchSpace = nullptr; + size_type *m_scratchFlags = nullptr; uint32_t *m_scratchConcurrentBitset = nullptr; hipDeviceProp_t m_deviceProp; - hipStream_t m_stream; + hipStream_t m_stream = nullptr; // Team Scratch Level 1 Space - mutable int64_t m_team_scratch_current_size; - mutable void *m_team_scratch_ptr; + mutable int64_t m_team_scratch_current_size = 0; + mutable void *m_team_scratch_ptr = nullptr; + mutable std::mutex m_team_scratch_mutex; bool was_finalized = false; @@ -117,9 +132,7 @@ class HIPInternal { int verify_is_initialized(const char *const label) const; - int is_initialized() const { - return m_hipDev >= 0; - } // 0 != m_scratchSpace && 0 != m_scratchFlags ; } + int is_initialized() const { return m_hipDev >= 0; } void initialize(int hip_device_id, hipStream_t stream = nullptr); void finalize(); @@ -128,25 +141,12 @@ class HIPInternal { void fence() const; + // returns the next driver type pointer in our work array + char *get_next_driver(size_t driverTypeSize) const; + ~HIPInternal(); - HIPInternal() - : m_hipDev(-1), - m_hipArch(-1), - m_multiProcCount(0), - m_maxWarpCount(0), - m_maxBlock(0), - m_maxSharedWords(0), - m_shmemPerSM(0), - m_maxShmemPerBlock(0), - m_maxThreadsPerSM(0), - m_scratchSpaceCount(0), - m_scratchFlagsCount(0), - m_scratchSpace(nullptr), - m_scratchFlags(nullptr), - m_stream(nullptr), - m_team_scratch_current_size(0), - m_team_scratch_ptr(nullptr) {} + HIPInternal() = default; // Resizing of reduction related scratch spaces size_type *scratch_space(const size_type size); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 3e972c7346..f774423b37 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -49,9 +49,9 @@ #if defined(__HIPCC__) -#include #include #include +#include // Must use global variable on the device with HIP-Clang #ifdef __HIP__ @@ -127,19 +127,69 @@ struct HIPDispatchProperties { HIPLaunchMechanism launch_mechanism = l; }; -template , +template +struct HIPParallelLaunchKernelFunc; + +template +struct HIPParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory> { + static auto get_kernel_func() { + return hip_parallel_launch_local_memory; + } +}; + +template +struct HIPParallelLaunchKernelFunc, + HIPLaunchMechanism::LocalMemory> { + static auto get_kernel_func() { + return hip_parallel_launch_local_memory; + } +}; + +template +struct HIPParallelLaunchKernelInvoker; + +template +struct HIPParallelLaunchKernelInvoker + : HIPParallelLaunchKernelFunc { + using base_t = HIPParallelLaunchKernelFunc; + + static void invoke_kernel(DriverType const *driver, dim3 const &grid, + dim3 const &block, int shmem, + HIPInternal const *hip_instance) { + (base_t::get_kernel_func())<<m_stream>>>( + driver); + } +}; + +template , HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory> struct HIPParallelLaunch; -template struct HIPParallelLaunch< DriverType, Kokkos::LaunchBounds, - HIPLaunchMechanism::LocalMemory> { - inline HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, - const HIPInternal *hip_instance, - const bool /*prefer_shmem*/) { + HIPLaunchMechanism::LocalMemory> + : HIPParallelLaunchKernelInvoker< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory> { + using base_t = HIPParallelLaunchKernelInvoker< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory>; + + HIPParallelLaunch(const DriverType &driver, const dim3 &grid, + const dim3 &block, const int shmem, + const HIPInternal *hip_instance, + const bool /*prefer_shmem*/) { if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { if (hip_instance->m_maxShmemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( @@ -148,72 +198,16 @@ struct HIPParallelLaunch< KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); - // FIXME_HIP -- there is currently an error copying (some) structs - // by value to the device in HIP-Clang / VDI - // As a workaround, we can malloc the DriverType and explictly copy over. - // To remove once solved in HIP - DriverType *d_driver; - HIP_SAFE_CALL(hipMalloc(&d_driver, sizeof(DriverType))); - HIP_SAFE_CALL(hipMemcpyAsync(d_driver, &driver, sizeof(DriverType), - hipMemcpyHostToDevice, - hip_instance->m_stream)); - hip_parallel_launch_local_memory - <<m_stream>>>(d_driver); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - HIP_SAFE_CALL(hipGetLastError()); - hip_instance->fence(); -#endif - HIP_SAFE_CALL(hipFree(d_driver)); - } - } - - static hipFuncAttributes get_hip_func_attributes() { - static hipFuncAttributes attr = []() { - hipFuncAttributes attr; - HIP_SAFE_CALL(hipFuncGetAttributes( - &attr, - reinterpret_cast( - hip_parallel_launch_local_memory))); - return attr; - }(); - return attr; - } -}; - -template -struct HIPParallelLaunch, - HIPLaunchMechanism::LocalMemory> { - inline HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, - const HIPInternal *hip_instance, - const bool /*prefer_shmem*/) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (hip_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "HIPParallelLaunch FAILED: shared memory request is too large")); - } - - KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); - // Invoke the driver function on the device - - // FIXME_HIP -- see note about struct copy by value above - DriverType *d_driver; - HIP_SAFE_CALL(hipMalloc(&d_driver, sizeof(DriverType))); - HIP_SAFE_CALL(hipMemcpyAsync(d_driver, &driver, sizeof(DriverType), - hipMemcpyHostToDevice, - hip_instance->m_stream)); - hip_parallel_launch_local_memory - <<m_stream>>>(d_driver); + DriverType *d_driver = reinterpret_cast( + hip_instance->get_next_driver(sizeof(DriverType))); + std::memcpy((void *)d_driver, (void *)&driver, sizeof(DriverType)); + base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) HIP_SAFE_CALL(hipGetLastError()); hip_instance->fence(); #endif - HIP_SAFE_CALL(hipFree(d_driver)); } } @@ -221,8 +215,7 @@ struct HIPParallelLaunch, static hipFuncAttributes attr = []() { hipFuncAttributes attr; HIP_SAFE_CALL(hipFuncGetAttributes( - &attr, reinterpret_cast( - hip_parallel_launch_local_memory))); + &attr, reinterpret_cast(base_t::get_kernel_func()))); return attr; }(); return attr; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp new file mode 100644 index 0000000000..ce1aff9586 --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp @@ -0,0 +1,37 @@ +#ifndef KOKKOS_HIP_MDRANGEPOLICY_HPP_ +#define KOKKOS_HIP_MDRANGEPOLICY_HPP_ + +#include + +namespace Kokkos { + +template <> +struct default_outer_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +template <> +struct default_inner_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +namespace Impl { + +// Settings for MDRangePolicy +template <> +inline TileSizeProperties get_tile_size_properties( + const Kokkos::Experimental::HIP& space) { + TileSizeProperties properties; + properties.max_threads = + space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.default_largest_tile_size = 16; + properties.default_tile_size = 4; + properties.max_total_tile_size = 1024; + return properties; +} + +} // Namespace Impl +} // Namespace Kokkos +#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index 6b831ff7a3..35e7d6fb85 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -49,6 +49,7 @@ #include #include #include +#include #include namespace Kokkos { @@ -72,7 +73,7 @@ class ParallelFor, ParallelFor& operator=(ParallelFor const&) = delete; public: - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { Kokkos::Impl::DeviceIterateTile(m_policy, m_functor) @@ -175,6 +176,25 @@ class ParallelFor, ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) : m_functor(arg_functor), m_policy(arg_policy) {} + + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + using closure_type = + ParallelFor, + Kokkos::Experimental::HIP>; + hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); + auto const& prop = pol.space().hip_device_prop(); + // Limits due to registers/SM, MDRange doesn't have + // shared memory constraints + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast( + Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock)); + } }; // ParallelReduce @@ -231,7 +251,7 @@ class ParallelReduce, ReducerType, DeviceIteratePattern(m_policy, m_functor, update).exec_range(); } - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { const integral_nonzero_constant word_count(ValueTraits::value_size( @@ -291,13 +311,19 @@ class ParallelReduce, ReducerType, ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< false, FunctorType, WorkTag>(f, n); + using closure_type = Impl::ParallelReduce; + hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); while ( (n && (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < shmem_size)) || - (n > static_cast( - ::Kokkos::Experimental::Impl::hip_get_max_block_size< - ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) { + (n > + static_cast( + ::Kokkos::Experimental::Impl::hip_get_max_block_size( + m_policy.space().impl_internal_space_instance(), attr, f, 1, + shmem_size, 0)))) { n >>= 1; shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< false, FunctorType, WorkTag>(f, n); @@ -391,6 +417,23 @@ class ParallelReduce, ReducerType, memory_space>::accessible), m_scratch_space(nullptr), m_scratch_flags(nullptr) {} + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + using closure_type = + ParallelReduce, + ReducerType, Kokkos::Experimental::HIP>; + hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); + auto const& prop = pol.space().hip_device_prop(); + // Limits due do registers/SM + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast( + Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock)); + } }; } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 5607f1c91a..7d2825eeb4 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -92,7 +92,7 @@ class ParallelFor, public: using functor_type = FunctorType; - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { const Member work_stride = blockDim.y * gridDim.x; const Member work_end = m_policy.end(); @@ -174,11 +174,14 @@ class ParallelReduce, ReducerType, size_type* m_scratch_space = nullptr; size_type* m_scratch_flags = nullptr; - // FIXME_HIP_PERFORMANCE Need a rule to choose when to use shared memory and - // when to use shuffle +#if HIP_VERSION < 401 static bool constexpr UseShflReduction = ((sizeof(value_type) > 2 * sizeof(double)) && static_cast(ValueTraits::StaticValueSize)); +#else + static bool constexpr UseShflReduction = + static_cast(ValueTraits::StaticValueSize); +#endif private: struct ShflReductionTag {}; @@ -330,13 +333,19 @@ class ParallelReduce, ReducerType, int shmem_size = hip_single_inter_block_reduce_scan_shmem( f, n); + using closure_type = Impl::ParallelReduce; + hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); while ( (n && (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < shmem_size)) || - (n > static_cast( - Kokkos::Experimental::Impl::hip_get_max_block_size< - ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) { + (n > + static_cast( + ::Kokkos::Experimental::Impl::hip_get_max_block_size( + m_policy.space().impl_internal_space_instance(), attr, f, 1, + shmem_size, 0)))) { n >>= 1; shmem_size = hip_single_inter_block_reduce_scan_shmem( @@ -493,7 +502,7 @@ class ParallelScanHIPBase { //---------------------------------------- - __device__ inline void initial(void) const { + __device__ inline void initial() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -529,7 +538,7 @@ class ParallelScanHIPBase { //---------------------------------------- - __device__ inline void final(void) const { + __device__ inline void final() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -606,7 +615,7 @@ class ParallelScanHIPBase { public: //---------------------------------------- - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { if (!m_final) { initial(); } else { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index 5da83d289e..96c3ff2a75 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -433,6 +433,9 @@ class ParallelFor, int m_shmem_size; void* m_scratch_ptr[2]; int m_scratch_size[2]; + // Only let one ParallelFor/Reduce modify the team scratch memory. The + // constructor acquires the mutex which is released in the destructor. + std::unique_lock m_scratch_lock; template __device__ inline @@ -449,7 +452,7 @@ class ParallelFor, } public: - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { // Iterate this block through the league int64_t threadid = 0; if (m_scratch_size[1] > 0) { @@ -513,7 +516,10 @@ class ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { + m_vector_size(arg_policy.impl_vector_length()), + m_scratch_lock(m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelFor, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -640,6 +646,9 @@ class ParallelReduce, const size_type m_league_size; int m_team_size; const size_type m_vector_size; + // Only let one ParallelFor/Reduce modify the team scratch memory. The + // constructor acquires the mutex which is released in the destructor. + std::unique_lock m_scratch_lock; template __device__ inline @@ -877,7 +886,10 @@ class ParallelReduce, m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { + m_vector_size(arg_policy.impl_vector_length()), + m_scratch_lock(m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelReduce, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -976,7 +988,10 @@ class ParallelReduce, m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { + m_vector_size(arg_policy.impl_vector_length()), + m_scratch_lock(m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelReduce, launch_bounds>::get_hip_func_attributes(); m_team_size = diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 00cef28f82..15ca089d14 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -42,12 +42,6 @@ //@HEADER */ -#include -#include -#include -#include -#include -#include #include #include @@ -57,6 +51,13 @@ #include #include +#include +#include +#include +#include +#include +#include + /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ namespace Kokkos { @@ -172,14 +173,14 @@ void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) { namespace Kokkos { -void Experimental::HIPSpace::access_error() { +KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error() { const std::string msg( "Kokkos::Experimental::HIPSpace::access_error attempt to execute " "Experimental::HIP function from non-HIP space"); Kokkos::Impl::throw_runtime_exception(msg); } -void Experimental::HIPSpace::access_error(const void* const) { +KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error(const void* const) { const std::string msg( "Kokkos::Experimental::HIPSpace::access_error attempt to execute " "Experimental::HIP function from non-HIP space"); @@ -326,45 +327,6 @@ SharedAllocationRecord SharedAllocationRecord< Kokkos::Experimental::HIPHostPinnedSpace, void>::s_root_record; #endif -std::string SharedAllocationRecord::get_label() const { - SharedAllocationHeader header; - - Kokkos::Impl::DeepCopy( - &header, RecordBase::head(), sizeof(SharedAllocationHeader)); - - return std::string(header.m_label); -} - -std::string SharedAllocationRecord::get_label() const { - return std::string(RecordBase::head()->m_label); -} - -SharedAllocationRecord* -SharedAllocationRecord::allocate( - const Kokkos::Experimental::HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -SharedAllocationRecord* -SharedAllocationRecord:: - allocate(const Kokkos::Experimental::HIPHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord* arg_rec) { - delete static_cast(arg_rec); -} - -void SharedAllocationRecord:: - deallocate(SharedAllocationRecord* arg_rec) { - delete static_cast(arg_rec); -} - SharedAllocationRecord::~SharedAllocationRecord() { const char* label = nullptr; @@ -393,7 +355,7 @@ SharedAllocationRecord:: const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, @@ -405,13 +367,7 @@ SharedAllocationRecord:: SharedAllocationHeader header; - // Fill in the Header information - header.m_record = static_cast*>(this); - - strncpy(header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; + this->base_t::_fill_host_accessible_header_info(header, arg_label); // Copy to device memory Kokkos::Impl::DeepCopy( @@ -425,7 +381,7 @@ SharedAllocationRecord:: const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, @@ -435,223 +391,8 @@ SharedAllocationRecord:: sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), m_space(arg_space) { // Fill in the Header information, directly accessible via host pinned memory - - RecordBase::m_alloc_ptr->m_record = this; - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; -} - -//---------------------------------------------------------------------------- - -void* SharedAllocationRecord:: - allocate_tracked(const Kokkos::Experimental::HIPSpace& arg_space, - const std::string& arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void* const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord* const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void* SharedAllocationRecord:: - reallocate_tracked(void* const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -void* SharedAllocationRecord:: - allocate_tracked(const Kokkos::Experimental::HIPHostPinnedSpace& arg_space, - const std::string& arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void* const - arg_alloc_ptr) { - if (arg_alloc_ptr) { - SharedAllocationRecord* const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void* SharedAllocationRecord:: - reallocate_tracked(void* const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - using HIPHostPinnedSpace = Kokkos::Experimental::HIPHostPinnedSpace; - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -//---------------------------------------------------------------------------- - -SharedAllocationRecord* -SharedAllocationRecord::get_record( - void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHIP = - SharedAllocationRecord; - - // Copy the header from the allocation - Header head; - - Header const* const head_hip = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - - if (alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, head_hip, sizeof(SharedAllocationHeader)); - } - - RecordHIP* const record = - alloc_ptr ? static_cast(head.m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head_hip) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HIPSpace " - ", void >::get_record ERROR")); - } - - return record; -} - -SharedAllocationRecord* -SharedAllocationRecord::get_record(void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHIP = - SharedAllocationRecord; - - Header* const h = - alloc_ptr ? reinterpret_cast(alloc_ptr) - 1 : nullptr; - - if (!alloc_ptr || h->m_record->m_alloc_ptr != h) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< " - "Kokkos::Experimental::HIPHostPinnedSpace , void >::get_record ERROR")); - } - - return static_cast(h->m_record); -} - -// Iterate records to print orphaned memory ... -void SharedAllocationRecord:: - print_records(std::ostream& s, const Kokkos::Experimental::HIPSpace&, - bool detail) { -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord* r = &s_root_record; - - char buffer[256]; - - SharedAllocationHeader head; - - if (detail) { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - } else { - head.m_label[0] = 0; - } - - // Formatting dependent on sizeof(uintptr_t) - const char* format_string; - - if (sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = - "HIP addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + " - "%.8ld ] count(%d) dealloc(0x%.12lx) %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = - "HIP addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ " - "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n"; - } - - snprintf(buffer, 256, format_string, reinterpret_cast(r), - reinterpret_cast(r->m_prev), - reinterpret_cast(r->m_next), - reinterpret_cast(r->m_alloc_ptr), r->m_alloc_size, - r->m_count, reinterpret_cast(r->m_dealloc), - head.m_label); - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } else { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - - // Formatting dependent on sizeof(uintptr_t) - const char* format_string; - - if (sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = "HIP [ 0x%.12lx + %ld ] %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = "HIP [ 0x%.12llx + %ld ] %s\n"; - } - - snprintf(buffer, 256, format_string, - reinterpret_cast(r->data()), r->size(), - head.m_label); - } else { - snprintf(buffer, 256, "HIP [ 0 + 0 ]\n"); - } - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } -#else - (void)s; - (void)detail; - throw_runtime_exception( - "Kokkos::Impl::SharedAllocationRecord::print_records" - " only works with KOKKOS_ENABLE_DEBUG enabled"); -#endif + this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, + arg_label); } } // namespace Impl @@ -680,63 +421,22 @@ void HIP::impl_initialize(const HIP::SelectDevice config) { void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); } HIP::HIP() - : m_space_instance(&Impl::HIPInternal::singleton()), m_counter(nullptr) { + : m_space_instance(&Impl::HIPInternal::singleton(), + [](Impl::HIPInternal*) {}) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); } HIP::HIP(hipStream_t const stream) - : m_space_instance(new Impl::HIPInternal), m_counter(new int(1)) { + : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) { + ptr->finalize(); + delete ptr; + }) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream); } -KOKKOS_FUNCTION HIP::HIP(HIP&& other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; -} - -KOKKOS_FUNCTION HIP::HIP(HIP const& other) - : m_space_instance(other.m_space_instance), m_counter(other.m_counter) { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif -} - -KOKKOS_FUNCTION HIP& HIP::operator=(HIP&& other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; - - return *this; -} - -KOKKOS_FUNCTION HIP& HIP::operator=(HIP const& other) { - m_space_instance = other.m_space_instance; - m_counter = other.m_counter; -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif - - return *this; -} - -KOKKOS_FUNCTION HIP::~HIP() noexcept { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU - if (m_counter == nullptr) return; - int const count = Kokkos::atomic_fetch_sub(m_counter, 1); - if (count == 1) { - delete m_counter; - m_space_instance->finalize(); - delete m_space_instance; - } -#endif -} - void HIP::print_configuration(std::ostream& s, const bool) { Impl::HIPInternal::singleton().print_configuration(s); } @@ -810,3 +510,26 @@ void HIPSpaceInitializer::print_configuration(std::ostream& msg, } // namespace Impl } // namespace Kokkos + +//============================================================================== +// {{{1 + +#include + +namespace Kokkos { +namespace Impl { + +// To avoid additional compilation cost for something that's (mostly?) not +// performance sensitive, we explicity instantiate these CRTP base classes here, +// where we have access to the associated *_timpl.hpp header files. +template class HostInaccessibleSharedAllocationRecordCommon< + Kokkos::Experimental::HIPSpace>; +template class SharedAllocationRecordCommon; +template class SharedAllocationRecordCommon< + Kokkos::Experimental::HIPHostPinnedSpace>; + +} // end namespace Impl +} // end namespace Kokkos + +// end Explicit instantiations of CRTP Base classes }}}1 +//============================================================================== diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index 7571510c31..fe52886ced 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -644,13 +644,14 @@ KOKKOS_INLINE_FUNCTION thread, count); } -template -KOKKOS_INLINE_FUNCTION - Impl::ThreadVectorRangeBoundariesStruct - ThreadVectorRange(const Impl::HIPTeamMember& thread, iType arg_begin, - iType arg_end) { +template +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type::type, Impl::HIPTeamMember> +ThreadVectorRange(const Impl::HIPTeamMember& thread, iType1 arg_begin, + iType2 arg_end) { + using iType = typename std::common_type::type; return Impl::ThreadVectorRangeBoundariesStruct( - thread, arg_begin, arg_end); + thread, iType(arg_begin), iType(arg_end)); } KOKKOS_INLINE_FUNCTION @@ -961,7 +962,7 @@ KOKKOS_INLINE_FUNCTION //---------------------------------------------------------------------------- -/** \brief Intra-thread vector parallel exclusive prefix sum. +/** \brief Intra-thread vector parallel scan with reducer. * * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) * @@ -969,22 +970,21 @@ KOKKOS_INLINE_FUNCTION * thread and a scan operation is performed. * The last call to closure has final == true. */ -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct& - loop_boundaries, - const Closure& closure) { +template +KOKKOS_INLINE_FUNCTION + typename std::enable_if::value>::type + parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::HIPTeamMember>& loop_boundaries, + const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - // Extract value_type from closure - - using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + using value_type = typename ReducerType::value_type; + value_type accum; + reducer.init(accum); + const value_type identity = accum; // Loop through boundaries by vector-length chunks // must scan at each iteration - value_type accum = 0; - // All thread "lanes" must loop the same number of times. // Determine an loop end for all thread "lanes." // Requires: @@ -997,47 +997,72 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const int end = loop_boundaries.end + (rem ? blockDim.x - rem : 0); for (int i = threadIdx.x; i < end; i += blockDim.x) { - value_type val = 0; + value_type val = identity; - // First acquire per-lane contributions: - if (i < loop_boundaries.end) closure(i, val, false); + // First acquire per-lane contributions. + // This sets i's val to i-1's contribution + // to make the latter in_place_shfl_up an + // exclusive scan -- the final accumulation + // of i's val will be included in the second + // closure call later. + if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false); - value_type sval = val; - - // Bottom up inclusive scan in triangular pattern + // Bottom up exclusive scan in triangular pattern // where each HIP thread is the root of a reduction tree // from the zeroth "lane" to itself. // [t] += [t-1] if t >= 1 // [t] += [t-2] if t >= 2 // [t] += [t-4] if t >= 4 // ... - + // This differs from the non-reducer overload, where an inclusive scan was + // implemented, because in general the binary operator cannot be inverted + // and we would not be able to remove the inclusive contribution by + // inversion. for (int j = 1; j < static_cast(blockDim.x); j <<= 1) { - value_type tmp = 0; - ::Kokkos::Experimental::Impl::in_place_shfl_up(tmp, sval, j, blockDim.x); + value_type tmp = identity; + ::Kokkos::Experimental::Impl::in_place_shfl_up(tmp, val, j, blockDim.x); if (j <= static_cast(threadIdx.x)) { - sval += tmp; + reducer.join(val, tmp); } } - // Include accumulation and remove value for exclusive scan: - val = accum + sval - val; + // Include accumulation + reducer.join(val, accum); - // Provide exclusive scan value: + // Update i's contribution into the val + // and add it to accum for next round if (i < loop_boundaries.end) closure(i, val, true); - - // Accumulate the last value in the inclusive scan: - ::Kokkos::Experimental::Impl::in_place_shfl(sval, sval, blockDim.x - 1, + ::Kokkos::Experimental::Impl::in_place_shfl(accum, val, blockDim.x - 1, blockDim.x); - - accum += sval; } #else (void)loop_boundaries; (void)closure; + (void)reducer; #endif } +//---------------------------------------------------------------------------- + +/** \brief Intra-thread vector parallel exclusive prefix sum. + * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to all vector lanes in the + * thread and a scan operation is performed. + * The last call to closure has final == true. + */ +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct& + loop_boundaries, + const Closure& closure) { + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + value_type dummy; + parallel_scan(loop_boundaries, closure, Kokkos::Sum(dummy)); +} + } // namespace Kokkos namespace Kokkos { diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index 140376425c..b7d8e62f69 100644 --- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -48,17 +48,11 @@ #include #include - +#include #include #include -#include #include -#if defined(KOKKOS_ENABLE_CUDA) || \ - (defined(__HIPCC__) && defined(KOKKOS_ENABLE_HIP)) -#include -#endif - namespace Kokkos { // ------------------------------------------------------------------ // @@ -74,22 +68,14 @@ enum class Iterate template struct default_outer_direction { - using type = Iterate; -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - static constexpr Iterate value = Iterate::Left; -#else + using type = Iterate; static constexpr Iterate value = Iterate::Right; -#endif }; template struct default_inner_direction { - using type = Iterate; -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - static constexpr Iterate value = Iterate::Left; -#else + using type = Iterate; static constexpr Iterate value = Iterate::Right; -#endif }; // Iteration Pattern @@ -179,6 +165,25 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( } return a; } + +struct TileSizeProperties { + int max_threads; + int default_largest_tile_size; + int default_tile_size; + int max_total_tile_size; +}; + +template +TileSizeProperties get_tile_size_properties(const ExecutionSpace&) { + // Host settings + TileSizeProperties properties; + properties.max_threads = std::numeric_limits::max(); + properties.default_largest_tile_size = 0; + properties.default_tile_size = 2; + properties.max_total_tile_size = std::numeric_limits::max(); + return properties; +} + } // namespace Impl // multi-dimensional iteration pattern @@ -208,7 +213,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { using launch_bounds = typename traits::launch_bounds; using member_type = typename range_policy::member_type; - enum { rank = static_cast(iteration_pattern::rank) }; + static constexpr int rank = iteration_pattern::rank; using index_type = typename traits::index_type; using array_index_type = std::int64_t; @@ -231,37 +236,20 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { point_type m_tile_end = {}; index_type m_num_tiles = 1; index_type m_prod_tile_dims = 1; + bool m_tune_tile_size = false; - /* - // NDE enum impl definition alternative - replace static constexpr int ? - enum { outer_direction = static_cast ( - (iteration_pattern::outer_direction != Iterate::Default) - ? iteration_pattern::outer_direction - : default_outer_direction< typename traits::execution_space>::value ) }; - - enum { inner_direction = static_cast ( - iteration_pattern::inner_direction != Iterate::Default - ? iteration_pattern::inner_direction - : default_inner_direction< typename traits::execution_space>::value ) }; - - enum { Right = static_cast( Iterate::Right ) }; - enum { Left = static_cast( Iterate::Left ) }; - */ - // static constexpr int rank = iteration_pattern::rank; - - static constexpr int outer_direction = static_cast( + static constexpr auto outer_direction = (iteration_pattern::outer_direction != Iterate::Default) ? iteration_pattern::outer_direction - : default_outer_direction::value); + : default_outer_direction::value; - static constexpr int inner_direction = static_cast( + static constexpr auto inner_direction = iteration_pattern::inner_direction != Iterate::Default ? iteration_pattern::inner_direction - : default_inner_direction::value); + : default_inner_direction::value; - // Ugly ugly workaround intel 14 not handling scoped enum correctly - static constexpr int Right = static_cast(Iterate::Right); - static constexpr int Left = static_cast(Iterate::Left); + static constexpr auto Right = Iterate::Right; + static constexpr auto Left = Iterate::Left; KOKKOS_INLINE_FUNCTION const typename traits::execution_space& space() const { return m_space; @@ -320,7 +308,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{}) : m_space(work_space), m_lower(lower), m_upper(upper), m_tile(tile) { - init(); + init_helper(Impl::get_tile_size_properties(work_space)); } template { m_tile(p.m_tile), m_tile_end(p.m_tile_end), m_num_tiles(p.m_num_tiles), - m_prod_tile_dims(p.m_prod_tile_dims) {} + m_prod_tile_dims(p.m_prod_tile_dims), + m_tune_tile_size(p.m_tune_tile_size) {} + + void impl_change_tile_size(const point_type& tile) { + m_tile = tile; + init_helper(Impl::get_tile_size_properties(m_space)); + } + bool impl_tune_tile_size() const { return m_tune_tile_size; } private: - void init() { - // Host - if (true -#if defined(KOKKOS_ENABLE_CUDA) - && !std::is_same::value -#endif -#if defined(KOKKOS_ENABLE_HIP) - && !std::is_same::value -#endif - ) { - index_type span; - for (int i = 0; i < rank; ++i) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - m_tile[i] = 2; - } else { - m_tile[i] = (span == 0 ? 1 : span); - } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } + void init_helper(Impl::TileSizeProperties properties) { + m_prod_tile_dims = 1; + int increment = 1; + int rank_start = 0; + int rank_end = rank; + if (inner_direction == Iterate::Right) { + increment = -1; + rank_start = rank - 1; + rank_end = -1; } -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - else // Cuda or HIP - { - index_type span; - int increment = 1; - int rank_start = 0; - int rank_end = rank; - if ((int)inner_direction == (int)Right) { - increment = -1; - rank_start = rank - 1; - rank_end = -1; - } - bool is_cuda_exec_space = -#if defined(KOKKOS_ENABLE_CUDA) - std::is_same::value; -#else - false; -#endif - for (int i = rank_start; i != rank_end; i += increment) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - // TODO: determine what is a good default tile size for Cuda and HIP - // may be rank dependent - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - if (m_prod_tile_dims < 256) { - m_tile[i] = (is_cuda_exec_space) ? 2 : 4; - } else { - m_tile[i] = 1; - } + for (int i = rank_start; i != rank_end; i += increment) { + const index_type length = m_upper[i] - m_lower[i]; + if (m_tile[i] <= 0) { + m_tune_tile_size = true; + if ((inner_direction == Iterate::Right && (i < rank - 1)) || + (inner_direction == Iterate::Left && (i > 0))) { + if (m_prod_tile_dims * properties.default_tile_size < + static_cast(properties.max_total_tile_size)) { + m_tile[i] = properties.default_tile_size; } else { - m_tile[i] = 16; + m_tile[i] = 1; } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } - if (m_prod_tile_dims > - 1024) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 - // max per dim (Kepler), but product num_threads < 1024 - if (is_cuda_exec_space) { - printf(" Tile dimensions exceed Cuda limits\n"); - Kokkos::abort( - "Cuda ExecSpace Error: MDRange tile dims exceed maximum number " - "of threads per block - choose smaller tile dims"); } else { - printf(" Tile dimensions exceed HIP limits\n"); - Kokkos::abort( - "HIP ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); + m_tile[i] = properties.default_largest_tile_size == 0 + ? std::max(length, 1) + : properties.default_largest_tile_size; } } + m_tile_end[i] = + static_cast((length + m_tile[i] - 1) / m_tile[i]); + m_num_tiles *= m_tile_end[i]; + m_prod_tile_dims *= m_tile[i]; + } + if (m_prod_tile_dims > static_cast(properties.max_threads)) { + printf(" Product of tile dimensions exceed maximum limit: %d\n", + static_cast(properties.max_threads)); + Kokkos::abort( + "ExecSpace Error: MDRange tile dims exceed maximum number " + "of threads per block - choose smaller tile dims"); } -#endif } }; diff --git a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp index 8e226a078d..fb94049d7a 100644 --- a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -104,20 +104,6 @@ struct MemorySpaceAccess { enum : bool { deepcopy = true }; }; -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 1 }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void *) {} -}; - -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 1 }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void *) {} -}; - } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index fb2925a066..6578723fc8 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -45,14 +45,13 @@ #define KOKKOS_COMPLEX_HPP #include +#include #include +#include #include +#include #include -#ifdef KOKKOS_ENABLE_SYCL -#include -#endif - namespace Kokkos { /// \class complex @@ -220,10 +219,11 @@ class // Conditional noexcept, just in case RType throws on divide-by-zero KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=( const complex& y) noexcept(noexcept(RealType{} / RealType{})) { + using Kokkos::Experimental::fabs; // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. - const RealType s = std::fabs(y.real()) + std::fabs(y.imag()); + const RealType s = fabs(y.real()) + fabs(y.imag()); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, @@ -248,10 +248,11 @@ class KOKKOS_INLINE_FUNCTION complex& operator/=( const std::complex& y) noexcept(noexcept(RealType{} / RealType{})) { + using Kokkos::Experimental::fabs; // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. - const RealType s = std::fabs(y.real()) + std::fabs(y.imag()); + const RealType s = fabs(y.real()) + fabs(y.imag()); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, @@ -693,35 +694,96 @@ KOKKOS_INLINE_FUNCTION RealType real(const complex& x) noexcept { return x.real(); } +//! Constructs a complex number from magnitude and phase angle +template +KOKKOS_INLINE_FUNCTION complex polar(const T& r, const T& theta = T()) { + using Kokkos::Experimental::cos; + using Kokkos::Experimental::sin; + KOKKOS_EXPECTS(r >= 0); + return complex(r * cos(theta), r * sin(theta)); +} + //! Absolute value (magnitude) of a complex number. template KOKKOS_INLINE_FUNCTION RealType abs(const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::hypot; -#else - using std::hypot; -#endif + using Kokkos::Experimental::hypot; return hypot(x.real(), x.imag()); } //! Power of a complex number -template -KOKKOS_INLINE_FUNCTION Kokkos::complex pow(const complex& x, - const RealType& e) { - RealType r = abs(x); -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan; - using cl::sycl::cos; - using cl::sycl::pow; - using cl::sycl::sin; -#else - using std::atan; - using std::cos; - using std::pow; - using std::sin; -#endif - RealType phi = atan(x.imag() / x.real()); - return pow(r, e) * Kokkos::complex(cos(phi * e), sin(phi * e)); +template +KOKKOS_INLINE_FUNCTION complex pow(const complex& x, const T& y) { + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::pow; + T r = abs(x); + T theta = atan2(x.imag(), x.real()); + return polar(pow(r, y), y * theta); +} + +template +KOKKOS_INLINE_FUNCTION complex pow(const T& x, const complex& y) { + return pow(complex(x), y); +} + +template +KOKKOS_INLINE_FUNCTION complex pow(const complex& x, + const complex& y) { + using Kokkos::Experimental::log; + + return x == T() ? T() : exp(y * log(x)); +} + +namespace Impl { +// NOTE promote would also be useful for math functions +template ::value> +struct promote { + using type = double; +}; +template +struct promote {}; +template <> +struct promote { + using type = long double; +}; +template <> +struct promote { + using type = double; +}; +template <> +struct promote { + using type = float; +}; +template +using promote_t = typename promote::type; +template +struct promote_2 { + using type = decltype(promote_t() + promote_t()); +}; +template +using promote_2_t = typename promote_2::type; +} // namespace Impl + +template ::value>> +KOKKOS_INLINE_FUNCTION complex> pow( + const T& x, const complex& y) { + using type = Impl::promote_2_t; + return pow(type(x), complex(y)); +} + +template ::value>> +KOKKOS_INLINE_FUNCTION complex> pow(const complex& x, + const U& y) { + using type = Impl::promote_2_t; + return pow(complex(x), type(y)); +} + +template +KOKKOS_INLINE_FUNCTION complex> pow( + const complex& x, const complex& y) { + using type = Impl::promote_2_t; + return pow(complex(x), complex(y)); } //! Square root of a complex number. This is intended to match the stdc++ @@ -729,26 +791,21 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex pow(const complex& x, template KOKKOS_INLINE_FUNCTION Kokkos::complex sqrt( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::abs; - using cl::sycl::sqrt; -#else - using std::abs; - using std::sqrt; -#endif + using Kokkos::Experimental::fabs; + using Kokkos::Experimental::sqrt; RealType r = x.real(); RealType i = x.imag(); if (r == RealType()) { - RealType t = sqrt(abs(i) / 2); + RealType t = sqrt(fabs(i) / 2); return Kokkos::complex(t, i < RealType() ? -t : t); } else { - RealType t = sqrt(2 * (abs(x) + abs(r))); + RealType t = sqrt(2 * (abs(x) + fabs(r))); RealType u = t / 2; - return r > RealType() - ? Kokkos::complex(u, i / t) - : Kokkos::complex(abs(i) / t, i < RealType() ? -u : u); + return r > RealType() ? Kokkos::complex(u, i / t) + : Kokkos::complex(fabs(i) / t, + i < RealType() ? -u : u); } } @@ -762,15 +819,9 @@ KOKKOS_INLINE_FUNCTION complex conj( //! Exponential of a complex number. template KOKKOS_INLINE_FUNCTION complex exp(const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::exp; - using cl::sycl::sin; -#else - using std::cos; - using std::exp; - using std::sin; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::exp; + using Kokkos::Experimental::sin; return exp(x.real()) * complex(cos(x.imag()), sin(x.imag())); } @@ -778,14 +829,9 @@ KOKKOS_INLINE_FUNCTION complex exp(const complex& x) { template KOKKOS_INLINE_FUNCTION Kokkos::complex log( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan; - using cl::sycl::log; -#else - using std::atan; - using std::log; -#endif - RealType phi = atan(x.imag() / x.real()); + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; + RealType phi = atan2(x.imag(), x.real()); return Kokkos::complex(log(abs(x)), phi); } @@ -793,17 +839,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex log( template KOKKOS_INLINE_FUNCTION Kokkos::complex sin( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(sin(x.real()) * cosh(x.imag()), cos(x.real()) * sinh(x.imag())); } @@ -812,17 +851,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex sin( template KOKKOS_INLINE_FUNCTION Kokkos::complex cos( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(cos(x.real()) * cosh(x.imag()), -sin(x.real()) * sinh(x.imag())); } @@ -838,17 +870,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex tan( template KOKKOS_INLINE_FUNCTION Kokkos::complex sinh( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(sinh(x.real()) * cos(x.imag()), cosh(x.real()) * sin(x.imag())); } @@ -857,17 +882,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex sinh( template KOKKOS_INLINE_FUNCTION Kokkos::complex cosh( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(cosh(x.real()) * cos(x.imag()), sinh(x.real()) * sin(x.imag())); } @@ -898,13 +916,8 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex acosh( template KOKKOS_INLINE_FUNCTION Kokkos::complex atanh( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan2; - using cl::sycl::log; -#else - using std::atan2; - using std::log; -#endif + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; const RealType i2 = x.imag() * x.imag(); const RealType r = RealType(1.0) - i2 - x.real() * x.real(); @@ -933,12 +946,7 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex asin( template KOKKOS_INLINE_FUNCTION Kokkos::complex acos( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::acos; - -#else - using std::acos; -#endif + using Kokkos::Experimental::acos; Kokkos::complex t = asin(x); RealType pi_2 = acos(RealType(0.0)); return Kokkos::complex(pi_2 - t.real(), -t.imag()); @@ -948,13 +956,8 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex acos( template KOKKOS_INLINE_FUNCTION Kokkos::complex atan( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan2; - using cl::sycl::log; -#else - using std::atan2; - using std::log; -#endif + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; const RealType r2 = x.real() * x.real(); const RealType i = RealType(1.0) - r2 - x.imag() * x.imag(); @@ -996,12 +999,13 @@ KOKKOS_INLINE_FUNCTION operator/(const complex& x, const complex& y) noexcept(noexcept(RealType1{} / RealType2{})) { + using Kokkos::Experimental::fabs; // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. using common_real_type = typename std::common_type::type; - const common_real_type s = std::fabs(real(y)) + std::fabs(imag(y)); + const common_real_type s = fabs(real(y)) + fabs(imag(y)); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, @@ -1046,7 +1050,7 @@ std::istream& operator>>(std::istream& is, complex& x) { } template -struct reduction_identity > { +struct reduction_identity> { using t_red_ident = reduction_identity; KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex sum() noexcept { diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index 4dac463a66..c3771ab393 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,10 @@ struct InitArguments { int skip_device; bool disable_warnings; bool tune_internals; + bool tool_help = false; + std::string tool_lib = {}; + std::string tool_args = {}; + InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false, bool ti = false) : num_threads{nt}, @@ -139,6 +144,10 @@ void pre_initialize(const InitArguments& args); void post_initialize(const InitArguments& args); +void declare_configuration_metadata(const std::string& category, + const std::string& key, + const std::string& value); + } // namespace Impl bool is_initialized() noexcept; diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp index 7502719c73..fe7eba3f6e 100644 --- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -50,6 +50,7 @@ // and compiler environment then sets a collection of #define macros. #include +#include #include #include @@ -180,7 +181,6 @@ using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION = // a given memory space. namespace Kokkos { - namespace Impl { #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) && \ @@ -196,16 +196,22 @@ using ActiveExecutionMemorySpace = Kokkos::HostSpace; using ActiveExecutionMemorySpace = void; #endif -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 0 }; +template +struct MemorySpaceAccess; + +template ::accessible> +struct verify_space { + KOKKOS_FUNCTION static void check() {} }; -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 1 }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void *) {} +template +struct verify_space { + KOKKOS_FUNCTION static void check() { + Kokkos::abort( + "Kokkos::View ERROR: attempt to access inaccessible memory space"); + }; }; // Base class for exec space initializer factories @@ -220,13 +226,13 @@ class LogicalMemorySpace; } // namespace Kokkos -#define KOKKOS_RESTRICT_EXECUTION_TO_DATA(DATA_SPACE, DATA_PTR) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \ - Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE>::verify(DATA_PTR) +#define KOKKOS_RESTRICT_EXECUTION_TO_DATA(DATA_SPACE, DATA_PTR) \ + Kokkos::Impl::verify_space::check(); -#define KOKKOS_RESTRICT_EXECUTION_TO_(DATA_SPACE) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \ - Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE>::verify() +#define KOKKOS_RESTRICT_EXECUTION_TO_(DATA_SPACE) \ + Kokkos::Impl::verify_space::check(); //---------------------------------------------------------------------------- @@ -256,8 +262,7 @@ template struct ViewCopy; -template +template struct FunctorPolicyExecutionSpace; //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index 4a573d82c0..1a10500b19 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -199,7 +199,7 @@ class CrsRowMapFromCounts { public: KOKKOS_INLINE_FUNCTION void operator()(index_type i, value_type& update, bool final_pass) const { - if (i < m_in.size()) { + if (i < static_cast(m_in.size())) { update += m_in(i); if (final_pass) m_out(i + 1) = update; } else if (final_pass) { diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp index 81e11f3f12..7a218120bb 100644 --- a/lib/kokkos/core/src/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp @@ -63,6 +63,7 @@ #include #include #include +#include /*--------------------------------------------------------------------------*/ @@ -198,16 +199,6 @@ class Cuda { Cuda(); - KOKKOS_FUNCTION Cuda(Cuda&& other) noexcept; - - KOKKOS_FUNCTION Cuda(const Cuda& other); - - KOKKOS_FUNCTION Cuda& operator=(Cuda&& other) noexcept; - - KOKKOS_FUNCTION Cuda& operator=(const Cuda& other); - - KOKKOS_FUNCTION ~Cuda() noexcept; - Cuda(cudaStream_t stream); //-------------------------------------------------------------------------- @@ -253,13 +244,12 @@ class Cuda { static const char* name(); inline Impl::CudaInternal* impl_internal_space_instance() const { - return m_space_instance; + return m_space_instance.get(); } uint32_t impl_instance_id() const noexcept { return 0; } private: - Impl::CudaInternal* m_space_instance; - int* m_counter; + Kokkos::Impl::HostSharedPtr m_space_instance; }; namespace Tools { @@ -319,38 +309,8 @@ struct MemorySpaceAccess -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = false }; - inline static void verify(void) { CudaSpace::access_error(); } - inline static void verify(const void* p) { CudaSpace::access_error(p); } -}; - } // namespace Impl } // namespace Kokkos -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -#include -#include -#include -#include -#include -#include -#include - -#include -//---------------------------------------------------------------------------- - #endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp index fc1c0e2f8a..e10fae93c7 100644 --- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp @@ -53,8 +53,10 @@ #include #include #include +#include #include +#include #include @@ -119,8 +121,8 @@ class CudaSpace { /*--------------------------------*/ /** \brief Error reporting for HostSpace attempt to access CudaSpace */ - static void access_error(); - static void access_error(const void* const); + KOKKOS_DEPRECATED static void access_error(); + KOKKOS_DEPRECATED static void access_error(const void* const); private: int m_device; ///< Which Cuda device @@ -128,42 +130,6 @@ class CudaSpace { static constexpr const char* m_name = "Cuda"; friend class Kokkos::Impl::SharedAllocationRecord; }; - -namespace Impl { -/// \brief Initialize lock array for arbitrary size atomics. -/// -/// Arbitrary atomics are implemented using a hash table of locks -/// where the hash value is derived from the address of the -/// object for which an atomic operation is performed. -/// This function initializes the locks to zero (unset). -void init_lock_arrays_cuda_space(); - -/// \brief Retrieve the pointer to the lock array for arbitrary size atomics. -/// -/// Arbitrary atomics are implemented using a hash table of locks -/// where the hash value is derived from the address of the -/// object for which an atomic operation is performed. -/// This function retrieves the lock array pointer. -/// If the array is not yet allocated it will do so. -int* atomic_lock_array_cuda_space_ptr(bool deallocate = false); - -/// \brief Retrieve the pointer to the scratch array for team and thread private -/// global memory. -/// -/// Team and Thread private scratch allocations in -/// global memory are acquired via locks. -/// This function retrieves the lock array pointer. -/// If the array is not yet allocated it will do so. -int* scratch_lock_array_cuda_space_ptr(bool deallocate = false); - -/// \brief Retrieve the pointer to the scratch array for unique identifiers. -/// -/// Unique identifiers in the range 0-Cuda::concurrency -/// are provided via locks. -/// This function retrieves the lock array pointer. -/// If the array is not yet allocated it will do so. -int* threadid_lock_array_cuda_space_ptr(bool deallocate = false); -} // namespace Impl } // namespace Kokkos /*--------------------------------------------------------------------------*/ @@ -313,6 +279,11 @@ class CudaHostPinnedSpace { namespace Kokkos { namespace Impl { +cudaStream_t cuda_get_deep_copy_stream(); + +const std::unique_ptr& cuda_get_deep_copy_space( + bool initialize = true); + static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, ""); @@ -784,104 +755,21 @@ struct DeepCopy { namespace Kokkos { namespace Impl { -/** Running in CudaSpace attempting to access HostSpace: error */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = false }; - KOKKOS_INLINE_FUNCTION static void verify(void) { - Kokkos::abort("Cuda code attempted to access HostSpace memory"); - } - - KOKKOS_INLINE_FUNCTION static void verify(const void*) { - Kokkos::abort("Cuda code attempted to access HostSpace memory"); - } -}; - -/** Running in CudaSpace accessing CudaUVMSpace: ok */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - -/** Running in CudaSpace accessing CudaHostPinnedSpace: ok */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - -/** Running in CudaSpace attempting to access an unknown space: error */ -template -struct VerifyExecutionCanAccessMemorySpace< - typename std::enable_if::value, - Kokkos::CudaSpace>::type, - OtherSpace> { - enum : bool { value = false }; - KOKKOS_INLINE_FUNCTION static void verify(void) { - Kokkos::abort("Cuda code attempted to access unknown Space memory"); - } - - KOKKOS_INLINE_FUNCTION static void verify(const void*) { - Kokkos::abort("Cuda code attempted to access unknown Space memory"); - } -}; - -//---------------------------------------------------------------------------- -/** Running in HostSpace attempting to access CudaSpace */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = false }; - inline static void verify(void) { CudaSpace::access_error(); } - inline static void verify(const void* p) { CudaSpace::access_error(p); } -}; - -/** Running in HostSpace accessing CudaUVMSpace is OK */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - inline static void verify(void) {} - inline static void verify(const void*) {} -}; - -/** Running in HostSpace accessing CudaHostPinnedSpace is OK */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - template <> class SharedAllocationRecord - : public SharedAllocationRecord { + : public HostInaccessibleSharedAllocationRecordCommon { private: friend class SharedAllocationRecord; + friend class SharedAllocationRecordCommon; + friend class HostInaccessibleSharedAllocationRecordCommon; using RecordBase = SharedAllocationRecord; + using base_t = + HostInaccessibleSharedAllocationRecordCommon; SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - static void deallocate(RecordBase*); - static ::cudaTextureObject_t attach_texture_object( const unsigned sizeof_alias, void* const alloc_ptr, const size_t alloc_size); @@ -890,39 +778,19 @@ class SharedAllocationRecord static RecordBase s_root_record; #endif - ::cudaTextureObject_t m_tex_obj; + ::cudaTextureObject_t m_tex_obj = 0; const Kokkos::CudaSpace m_space; protected: ~SharedAllocationRecord(); - SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {} + SharedAllocationRecord() = default; SharedAllocationRecord( const Kokkos::CudaSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); + const RecordBase::function_type arg_dealloc = &base_t::deallocate); public: - std::string get_label() const; - - static SharedAllocationRecord* allocate(const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - template inline ::cudaTextureObject_t attach_texture_object() { static_assert((std::is_same::value || @@ -945,57 +813,35 @@ class SharedAllocationRecord // Texture object is attached to the entire allocation range return ptr - reinterpret_cast(RecordBase::m_alloc_ptr); } - - static void print_records(std::ostream&, const Kokkos::CudaSpace&, - bool detail = false); }; template <> class SharedAllocationRecord - : public SharedAllocationRecord { + : public SharedAllocationRecordCommon { private: + friend class SharedAllocationRecordCommon; + + using base_t = SharedAllocationRecordCommon; using RecordBase = SharedAllocationRecord; SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - static void deallocate(RecordBase*); - static RecordBase s_root_record; - ::cudaTextureObject_t m_tex_obj; + ::cudaTextureObject_t m_tex_obj = 0; const Kokkos::CudaUVMSpace m_space; protected: ~SharedAllocationRecord(); - SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {} + SharedAllocationRecord() = default; SharedAllocationRecord( const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); + const RecordBase::function_type arg_dealloc = &base_t::deallocate); public: - std::string get_label() const; - - static SharedAllocationRecord* allocate(const Kokkos::CudaUVMSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::CudaUVMSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - template inline ::cudaTextureObject_t attach_texture_object() { static_assert((std::is_same::value || @@ -1019,57 +865,32 @@ class SharedAllocationRecord // Texture object is attached to the entire allocation range return ptr - reinterpret_cast(RecordBase::m_alloc_ptr); } - - static void print_records(std::ostream&, const Kokkos::CudaUVMSpace&, - bool detail = false); }; template <> class SharedAllocationRecord - : public SharedAllocationRecord { + : public SharedAllocationRecordCommon { private: + friend class SharedAllocationRecordCommon; + using RecordBase = SharedAllocationRecord; + using base_t = SharedAllocationRecordCommon; SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - static void deallocate(RecordBase*); - static RecordBase s_root_record; const Kokkos::CudaHostPinnedSpace m_space; protected: ~SharedAllocationRecord(); - SharedAllocationRecord() : RecordBase(), m_space() {} + SharedAllocationRecord() = default; SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size, const RecordBase::function_type arg_dealloc = &deallocate); - - public: - std::string get_label() const; - - static SharedAllocationRecord* allocate( - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size); - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - - static void print_records(std::ostream&, const Kokkos::CudaHostPinnedSpace&, - bool detail = false); }; } // namespace Impl diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index 3afe081701..55aed13670 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -856,11 +856,12 @@ KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct ThreadVectorRange(const TeamMemberType&, const iType& count) = delete; -template -KOKKOS_INLINE_FUNCTION_DELETED - Impl::ThreadVectorRangeBoundariesStruct - ThreadVectorRange(const TeamMemberType&, const iType& arg_begin, - const iType& arg_end) = delete; +template +KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type::type, TeamMemberType> +ThreadVectorRange(const TeamMemberType&, const iType1& arg_begin, + const iType2& arg_end) = delete; namespace Impl { @@ -902,85 +903,6 @@ struct ParallelConstructName { } // namespace Kokkos namespace Kokkos { -namespace Experimental { - -namespace Impl { -template -struct PolicyPropertyAdaptor; - -template class Policy, - class... Properties> -struct PolicyPropertyAdaptor, - Policy> { - using policy_in_t = Policy; - static_assert(is_execution_policy::value, ""); - using policy_out_t = Policy, - typename policy_in_t::traits::occupancy_control>; -}; - -template