Merge branch 'master' into collected-small-changes
@@ -217,13 +217,20 @@ elseif(GPU_API STREQUAL "OPENCL")
elseif(GPU_API STREQUAL "HIP")
  if(NOT DEFINED HIP_PATH)
    if(NOT DEFINED ENV{HIP_PATH})
      set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to HIP installation")
    else()
      set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to HIP installation")
    endif()
  endif()
  if(NOT DEFINED ROCM_PATH)
    if(NOT DEFINED ENV{ROCM_PATH})
      set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
    else()
      set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
    endif()
  endif()
  list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
  find_package(hip REQUIRED)
  option(HIP_USE_DEVICE_SORT "Use GPU sorting" ON)

  if(NOT DEFINED HIP_PLATFORM)
@@ -325,10 +332,11 @@ elseif(GPU_API STREQUAL "HIP")
  set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h ${LAMMPS_LIB_BINARY_DIR}/gpu/*.cu.cpp")

  add_library(gpu STATIC ${GPU_LIB_SOURCES})
  target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
  target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
  target_compile_definitions(gpu PRIVATE -DUSE_HIP)
  target_link_libraries(gpu PRIVATE hip::host)

  if(HIP_USE_DEVICE_SORT)
    # add hipCUB
@@ -377,8 +385,9 @@ elseif(GPU_API STREQUAL "HIP")
    endif()
  endif()

  add_executable(hip_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
  target_compile_definitions(hip_get_devices PRIVATE -DUCL_HIP)
  target_link_libraries(hip_get_devices hip::host)

  if(HIP_PLATFORM STREQUAL "nvcc")
    target_compile_definitions(gpu PRIVATE -D__HIP_PLATFORM_NVCC__)

cmake/presets/hip_amd.cmake (new file)
@@ -0,0 +1,30 @@
# preset that will enable hip (clang/clang++) with support for MPI and OpenMP (on Linux boxes)

# prefer flang over gfortran, if available
find_program(CLANG_FORTRAN NAMES flang gfortran f95)
set(ENV{OMPI_FC} ${CLANG_FORTRAN})

set(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "" FORCE)
set(CMAKE_C_COMPILER "hipcc" CACHE STRING "" FORCE)
set(CMAKE_Fortran_COMPILER ${CLANG_FORTRAN} CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_DEBUG "-Wall -Wextra -g" CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_DEBUG "-Wall -Wextra -g -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -DNDEBUG -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_DEBUG "-Wall -Wextra -g" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "" FORCE)

set(MPI_CXX "hipcc" CACHE STRING "" FORCE)
set(MPI_CXX_COMPILER "mpicxx" CACHE STRING "" FORCE)

unset(HAVE_OMP_H_INCLUDE CACHE)
set(OpenMP_C "hipcc" CACHE STRING "" FORCE)
set(OpenMP_C_FLAGS "-fopenmp" CACHE STRING "" FORCE)
set(OpenMP_C_LIB_NAMES "omp" CACHE STRING "" FORCE)
set(OpenMP_CXX "hipcc" CACHE STRING "" FORCE)
set(OpenMP_CXX_FLAGS "-fopenmp" CACHE STRING "" FORCE)
set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE)
set(OpenMP_omp_LIBRARY "libomp.so" CACHE PATH "" FORCE)
@@ -11,6 +11,7 @@ of time and requests from the LAMMPS user community.
   :maxdepth: 1

   Developer_org
   Developer_parallel
   Developer_flow
   Developer_write
   Developer_notes

doc/src/Developer_par_comm.rst (new file)
@@ -0,0 +1,120 @@
Communication
^^^^^^^^^^^^^

Following the partitioning scheme in use, all per-atom data (atom IDs,
positions, velocities, types, etc.) is distributed across the MPI
processes, which allows LAMMPS to handle very large systems, provided a
correspondingly large number of MPI processes is used.  To compute the
short-range interactions, MPI processes need access not only to the data
of atoms they "own" but also to information about atoms from neighboring
sub-domains, referred to in LAMMPS as "ghost" atoms.  These are copies
of atoms that store the required per-atom data for up to the
communication cutoff distance.  The green dashed-line boxes in the
:ref:`domain-decomposition` figure illustrate the extended ghost-atom
sub-domain for one processor.

This approach is also used to implement periodic boundary conditions:
atoms that lie within the cutoff distance across a periodic boundary are
also stored as ghost atoms and taken from the periodic replication of
the sub-domain, which may be the same sub-domain, e.g. when running in
serial.  As a consequence, force computation in LAMMPS is not subject to
minimum image conventions and thus cutoffs may be larger than half the
simulation domain.

.. _ghost-atom-comm:
.. figure:: img/ghost-comm.png
   :align: center

   ghost atom communication

   This figure shows the ghost atom communication patterns between
   sub-domains for "brick" (left) and "tiled" (right) communication
   styles for 2d simulations.  The numbers indicate MPI process ranks.
   Here the sub-domains are drawn spatially separated for clarity.  The
   dashed-line box is the extended sub-domain of processor 0 which
   includes its ghost atoms.  The red- and blue-shaded boxes are the
   regions of communicated ghost atoms.

Efficient communication patterns are needed to update the "ghost" atom
data, since that needs to be done at every MD time step or minimization
step.  The diagrams of the :ref:`ghost-atom-comm` figure illustrate how
ghost atom communication is performed in two stages for a 2d simulation
(three in 3d) for both a regular and an irregular partitioning of the
simulation box.  For the regular case (left), atoms are exchanged first
in the *x*-direction, then in *y*, with four neighbors in the grid of
processor sub-domains.

In the *x* stage, processor ranks 1 and 2 send owned atoms in their
red-shaded regions to rank 0 (and vice versa).  Then in the *y* stage,
ranks 3 and 4 send atoms in their blue-shaded regions to rank 0, which
includes ghost atoms they received in the *x* stage.  Rank 0 thus
acquires all its ghost atoms; atoms in the solid blue corner regions
are communicated twice before rank 0 receives them.

For the irregular case (right), the two stages are similar, but a
processor can have more than one neighbor in each direction.  In the
*x* stage, MPI ranks 1,2,3 send owned atoms in their red-shaded regions
to rank 0 (and vice versa).  These include only atoms between the lower
and upper *y*-boundary of rank 0's sub-domain.  In the *y* stage, ranks
4,5,6 send atoms in their blue-shaded regions to rank 0.  This may
include ghost atoms they received in the *x* stage, but only if they
are needed by rank 0 to fill its extended ghost atom regions in the
+/-*y* directions (blue rectangles).  Thus in this case, ranks 5 and
6 do not include ghost atoms they received from each other (in the *x*
stage) in the atoms they send to rank 0.  The key point is that while
the pattern of communication is more complex in the irregular
partitioning case, it can still proceed in two stages (three in 3d)
via atom exchanges with only neighboring processors.

When attributes of owned atoms are sent to neighboring processors to
become attributes of their ghost atoms, LAMMPS calls this a "forward"
communication.  On timesteps when atoms migrate to new owning processors
and neighbor lists are rebuilt, each processor creates a list of its
owned atoms which are ghost atoms on each of its neighbor processors.
These lists are used to pack per-atom coordinates (for example) into
message buffers in subsequent steps until the next reneighboring.

A "reverse" communication is when computed ghost atom attributes are
sent back to the processor that owns the atom.  This is used (for
example) to sum partial forces on ghost atoms into the complete force on
owned atoms.  The order of the two stages described in the
:ref:`ghost-atom-comm` figure is inverted and the same lists of atoms
are used to pack and unpack message buffers with per-atom forces.  When
a received buffer is unpacked, the ghost forces are summed to owned atom
forces.  As in forward communication, forces on atoms in the four blue
corners of the diagrams are sent, received, and summed twice (once at
each stage) before the owning processors have the full force.

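As a rough illustration (not the actual LAMMPS implementation, which
lives in the ``Comm`` classes and delegates packing and unpacking to the
individual styles), a single forward-communication swap with one pair of
neighbor processors could look like the following sketch; all names are
illustrative only:

.. code-block:: c++

   #include <mpi.h>
   #include <vector>

   // One forward-communication swap: pack owned atoms listed in "sendlist",
   // exchange buffers with the neighbor ranks, and unpack into ghost slots
   // starting at index "firstrecv".  Reverse communication runs the exchange
   // in the opposite direction and sums the unpacked values instead.
   void forward_comm_swap(const std::vector<int> &sendlist, double **x,
                          int nrecv, int firstrecv, int sendproc, int recvproc,
                          MPI_Comm world)
   {
     int me;
     MPI_Comm_rank(world, &me);
     std::vector<double> sendbuf(3 * sendlist.size()), recvbuf(3 * nrecv);
     for (int i = 0; i < (int) sendlist.size(); ++i) {   // pack owned atom coordinates
       const int j = sendlist[i];
       sendbuf[3*i+0] = x[j][0];
       sendbuf[3*i+1] = x[j][1];
       sendbuf[3*i+2] = x[j][2];
     }
     if (sendproc != me) {                               // exchange with neighbor ranks
       MPI_Request request;
       MPI_Irecv(recvbuf.data(), (int) recvbuf.size(), MPI_DOUBLE, recvproc, 0, world, &request);
       MPI_Send(sendbuf.data(), (int) sendbuf.size(), MPI_DOUBLE, sendproc, 0, world);
       MPI_Wait(&request, MPI_STATUS_IGNORE);
     } else {
       recvbuf = sendbuf;                                // self-exchange: pack/unpack only
     }
     for (int i = 0; i < nrecv; ++i) {                   // unpack into ghost atom slots
       x[firstrecv + i][0] = recvbuf[3*i+0];
       x[firstrecv + i][1] = recvbuf[3*i+1];
       x[firstrecv + i][2] = recvbuf[3*i+2];
     }
   }
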
These two operations are used in many places within LAMMPS aside from
the exchange of coordinates and forces, for example by manybody
potentials to share intermediate per-atom values, or by rigid-body
integrators to enable each atom in a body to access body properties.
Here are additional details about how these communication operations are
performed in LAMMPS:

- When exchanging data with different processors, forward and reverse
  communication is done using ``MPI_Send()`` and ``MPI_Irecv()`` calls.
  If a processor is "exchanging" atoms with itself, only the pack and
  unpack operations are performed, e.g. to create ghost atoms across
  periodic boundaries when running on a single processor.

- For forward communication of owned atom coordinates, periodic box
  lengths are added and subtracted when the receiving processor is
  across a periodic boundary from the sender.  There is then no need to
  apply a minimum image convention when calculating distances between
  atom pairs when building neighbor lists or computing forces.

- The cutoff distance for exchanging ghost atoms is typically equal to
  the neighbor cutoff.  But it can also be chosen to be longer if
  needed, e.g. half the diameter of a rigid body composed of multiple
  atoms or over 3x the length of a stretched bond for dihedral
  interactions.  It can also exceed the periodic box size.  For the
  regular communication pattern (left), if the cutoff distance extends
  beyond a neighbor processor's sub-domain, then multiple exchanges are
  performed in the same direction.  Each exchange is with the same
  neighbor processor, but buffers are packed/unpacked using a different
  list of atoms.  For forward communication, in the first exchange a
  processor sends only owned atoms.  In subsequent exchanges, it sends
  ghost atoms received in previous exchanges.  For the irregular pattern
  (right), overlaps of a processor's extended ghost-atom sub-domain with
  all other processors in each dimension are detected.

doc/src/Developer_par_long.rst (new file)
@@ -0,0 +1,188 @@
Long-range interactions
^^^^^^^^^^^^^^^^^^^^^^^

For charged systems, LAMMPS can compute long-range Coulombic
interactions via the FFT-based particle-particle/particle-mesh (PPPM)
method implemented in :doc:`kspace style pppm and its variants
<kspace_style>`.  For that, Coulombic interactions are partitioned into
short- and long-range components.  The short-range portion is computed
in real space as a loop over pairs of charges within a cutoff distance,
using neighbor lists.  The long-range portion is computed in reciprocal
space using a kspace style.  For the PPPM implementation, the simulation
cell is overlaid with a regular 3d FFT grid.  It proceeds in several
stages:

a) each atom's point charge is interpolated to nearby FFT grid points,
b) a forward 3d FFT is performed,
c) a convolution operation is performed in reciprocal space,
d) one or more inverse 3d FFTs are performed, and
e) electric field values from grid points near each atom are
   interpolated to compute its forces.

For any of the spatial-decomposition partitioning schemes, each
processor owns the brick-shaped portion of FFT grid points contained
within its sub-domain.  The two interpolation operations use a stencil
of grid points surrounding each atom.  To accommodate the stencil size,
each processor also stores a few layers of ghost grid points surrounding
its brick.  Forward and reverse communication of grid point values is
performed similarly to the corresponding :doc:`atom data communication
<Developer_par_comm>`.  In this case, electric field values on owned
grid points are sent to neighboring processors to become ghost point
values.  Likewise, charge values on ghost points are sent and summed
into values on owned points.

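As a rough illustration of stage a), the following sketch assigns point
charges to a periodic FFT grid using the lowest-order (trilinear)
weights; PPPM in LAMMPS uses higher-order stencils and distributed
grids, so this is only a conceptual example with hypothetical names:

.. code-block:: c++

   #include <cmath>
   #include <cstddef>
   #include <vector>

   // Conceptual sketch: lowest-order charge assignment onto a periodic
   // nx x ny x nz grid; coordinates are assumed already wrapped into the box.
   void assign_charges(int natoms, double **x, const double *q,
                       int nx, int ny, int nz, const double *boxlo,
                       const double *prd, std::vector<double> &rho)
   {
     rho.assign((std::size_t) nx * ny * nz, 0.0);
     for (int i = 0; i < natoms; ++i) {
       // fractional grid coordinates of atom i
       const double gx = (x[i][0] - boxlo[0]) / prd[0] * nx;
       const double gy = (x[i][1] - boxlo[1]) / prd[1] * ny;
       const double gz = (x[i][2] - boxlo[2]) / prd[2] * nz;
       const int ix = (int) std::floor(gx), iy = (int) std::floor(gy), iz = (int) std::floor(gz);
       const double fx = gx - ix, fy = gy - iy, fz = gz - iz;
       const double wx[2] = {1.0 - fx, fx}, wy[2] = {1.0 - fy, fy}, wz[2] = {1.0 - fz, fz};
       for (int a = 0; a < 2; ++a)          // spread charge over the 2x2x2 stencil
         for (int b = 0; b < 2; ++b)
           for (int c = 0; c < 2; ++c) {
             const int jx = (ix + a) % nx, jy = (iy + b) % ny, jz = (iz + c) % nz;
             rho[((std::size_t) jx * ny + jy) * nz + jz] += q[i] * wx[a] * wy[b] * wz[c];
           }
     }
   }
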
For triclinic simulation boxes, the FFT grid planes are parallel to
the box faces, but the mapping of charge and electric field values
to/from grid points is done in reduced coordinates where the tilted
box is conceptually a unit cube, so that the stencil and FFT
operations are unchanged.  However, the FFT grid size required for a
given accuracy is larger for triclinic domains than it is for
orthogonal boxes.

.. _fft-parallel:
.. figure:: img/fft-decomp-parallel.png
   :align: center

   parallel FFT in PPPM

   Stages of a parallel FFT for a simulation domain overlaid with an
   8x8x8 3d FFT grid, partitioned across 64 processors.  Within each of
   the 4 diagrams, grid cells of the same color are owned by a single
   processor; for simplicity only cells owned by 4 or 8 of the 64
   processors are colored.  The two images on the left illustrate
   brick-to-pencil communication.  The two images on the right
   illustrate pencil-to-pencil communication, which in this case
   transposes the *y* and *z* dimensions of the grid.

Parallel 3d FFTs require substantial communication relative to their
computational cost.  A 3d FFT is implemented as a series of 1d FFTs
along the *x*-, *y*-, and *z*-directions of the FFT grid.  Thus the FFT
grid cannot be decomposed in 3 dimensions like the atoms for parallel
processing of the FFTs, but only in 1 (as planes) or 2 (as pencils)
dimensions, and between the steps the grid needs to be transposed so
that the FFT grid portion "owned" by each MPI process is complete in the
direction of the 1d FFTs it has to perform.  LAMMPS uses the
pencil-decomposition algorithm shown in the :ref:`fft-parallel` figure.

Initially (far left), each processor owns a brick of same-color grid
cells (actually grid points) contained within its sub-domain.  A
brick-to-pencil communication operation converts this layout to 1d
pencils in the *x* dimension (center left).  Again, cells of the same
color are owned by the same processor.  Each processor can then compute
a 1d FFT on each pencil of data it wholly owns using a call to the
configured FFT library.  A pencil-to-pencil communication then converts
this layout to pencils in the *y* dimension (center right), which
effectively transposes the *x* and *y* dimensions of the grid, followed
by 1d FFTs in *y*.  A final transpose of pencils from *y* to *z* (far
right) followed by 1d FFTs in *z* completes the forward FFT.  The data
is left in a *z*-pencil layout for the convolution operation.  One or
more inverse FFTs then perform the sequence of 1d FFTs and communication
steps in reverse order; the final layout of resulting grid values is the
same as the initial brick layout.

Each communication operation within the FFT (brick-to-pencil,
pencil-to-pencil, or pencil-to-brick) converts one tiling of the 3d grid
to another, where a tiling in this context means an assignment of a
small brick-shaped subset of grid points to each processor, the union of
which comprises the entire grid.  The parallel `fftMPI library
<https://lammps.github.io/fftmpi/>`_ written for LAMMPS allows arbitrary
definitions of the tiling so that an irregular partitioning of the
simulation domain can use it directly.  Transforming data from one
tiling to another is implemented in `fftMPI` using point-to-point
communication, where each processor sends data to a few other
processors, since each tile in the initial tiling overlaps with only a
handful of tiles in the final tiling.

The transformations could also be done using collective communication
across all *P* processors with a single call to ``MPI_Alltoall()``, but
this is typically much slower.  However, for the specialized brick and
pencil tilings illustrated in the :ref:`fft-parallel` figure, collective
communication across the entire MPI communicator is not required.  In
the example, an :math:`8^3` grid with 512 grid cells is partitioned
across 64 processors; each processor owns a 2x2x2 3d brick of grid
cells.  The initial brick-to-pencil communication (upper left to upper
right) only requires collective communication within subgroups of 4
processors, as illustrated by the 4 colors.  More generally, a
brick-to-pencil communication can be performed by partitioning *P*
processors into :math:`P^{\frac{2}{3}}` subgroups of
:math:`P^{\frac{1}{3}}` processors each.  Each subgroup performs
collective communication only within its subgroup.  Similarly,
pencil-to-pencil communication can be performed by partitioning *P*
processors into :math:`P^{\frac{1}{2}}` subgroups of
:math:`P^{\frac{1}{2}}` processors each.  This is illustrated in the
figure for the :math:`y \Rightarrow z` communication (center).  An
eight-processor subgroup owns the front *yz* plane of data and performs
collective communication within the subgroup to transpose from a
*y*-pencil to a *z*-pencil layout.

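As a quick check of the arithmetic for the :math:`P = 64` example shown
in the figure:

.. math::

   P^{\tfrac{2}{3}} = 16 \text{ subgroups of } P^{\tfrac{1}{3}} = 4
   \text{ processors (brick} \leftrightarrow \text{pencil)}, \qquad
   P^{\tfrac{1}{2}} = 8 \text{ subgroups of } P^{\tfrac{1}{2}} = 8
   \text{ processors (pencil} \leftrightarrow \text{pencil)}.
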
LAMMPS invokes point-to-point communication by default, but also
provides the option of partitioned collective communication when using
the :doc:`kspace_modify collective yes <kspace_modify>` command to
switch to that mode.  In the latter case, the code detects the size of
the disjoint subgroups and partitions the single *P*-size communicator
into multiple smaller communicators, each of which invokes collective
communication.  Testing on a large IBM Blue Gene/Q machine at Argonne
National Laboratory showed a significant improvement in FFT performance
for large processor counts; partitioned collective communication was
faster than point-to-point communication or global collective
communication involving all *P* processors.

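A minimal sketch of the communicator-splitting idea behind partitioned
collectives (names are illustrative; the real code derives the subgroup
"color" from the brick and pencil tilings and uses the fftMPI remap
routines):

.. code-block:: c++

   #include <mpi.h>

   // Split the world communicator into disjoint subgroups and remap FFT data
   // with a collective call inside each subgroup instead of across all ranks.
   void remap_with_subgroup_collective(MPI_Comm world, int color,
                                       const double *sendbuf, double *recvbuf,
                                       int count_per_rank)
   {
     int me;
     MPI_Comm_rank(world, &me);
     MPI_Comm subgroup;
     // ranks with the same "color" (e.g. the same pencil plane) form one subgroup
     MPI_Comm_split(world, color, me, &subgroup);
     MPI_Alltoall(sendbuf, count_per_rank, MPI_DOUBLE,
                  recvbuf, count_per_rank, MPI_DOUBLE, subgroup);
     MPI_Comm_free(&subgroup);
   }
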
Here are some additional details about FFTs for long-range and related
grid/particle operations that LAMMPS supports:

- The fftMPI library allows each grid dimension to be a multiple of
  small prime factors (2,3,5), and allows any number of processors to
  perform the FFT.  The resulting brick and pencil decompositions are
  thus not always as well-aligned, but the sizes of the subgroups of
  processors for the two modes of communication (brick/pencil and
  pencil/pencil) still scale as :math:`O(P^{\frac{1}{3}})` and
  :math:`O(P^{\frac{1}{2}})`.

- For efficiency in performing 1d FFTs, the grid transpose operations
  illustrated in the :ref:`fft-parallel` figure also involve reordering
  the 3d data so that a different dimension is contiguous in memory.
  This reordering can be done during the packing or unpacking of buffers
  for MPI communication.

- For large systems and particularly a large number of MPI processes,
  the dominant cost for parallel FFTs is often the communication, not
  the computation of 1d FFTs, even though the latter scales as :math:`N
  \log(N)` in the number of grid points *N* per grid direction.  This is
  due to the fact that only a 2d decomposition into pencils is possible,
  while atom data (and their corresponding short-range force and energy
  computations) can be decomposed efficiently in 3d.

  This can be addressed by reducing the number of MPI processes involved
  in the MPI communication by using :doc:`hybrid MPI + OpenMP
  parallelization <Speed_omp>`.  This uses OpenMP parallelization inside
  the MPI domains, and while that may have a lower parallel efficiency,
  it reduces the communication overhead.

  As an alternative, it is also possible to start a :ref:`multi-partition
  <partition>` calculation and then use the :doc:`verlet/split
  integrator <run_style>` to perform the PPPM computation on a
  dedicated, separate partition of MPI processes.  This uses an integer
  "1:*p*" mapping of *p* sub-domains of the atom decomposition to one
  sub-domain of the FFT grid decomposition, where pairwise non-bonded
  and bonded forces and energies are computed on the larger partition
  and the PPPM kspace computation runs concurrently on the smaller
  partition.

- LAMMPS also implements PPPM-based solvers for other long-range
  interactions, dipole and dispersion (Lennard-Jones), which can be used
  in conjunction with long-range Coulombics for point charges.

- LAMMPS implements a ``GridComm`` class which overlays the simulation
  domain with a regular grid, partitions it across processors in a
  manner consistent with processor sub-domains, and provides methods for
  forward and reverse communication of owned and ghost grid point
  values.  It is used for PPPM as an FFT grid (as outlined above) and
  also for the MSM algorithm, which uses a cascade of grid sizes from
  fine to coarse to compute long-range Coulombic forces.  The GridComm
  class is also useful for models where continuum fields interact with
  particles.  For example, the two-temperature model (TTM) defines heat
  transfer between atoms (particles) and electrons (continuum gas),
  where spatial variations in the electron temperature are computed by
  finite differences of a discretized heat equation on a regular grid.
  The :doc:`fix ttm/grid <fix_ttm>` command uses the ``GridComm`` class
  internally to perform its grid operations on a distributed grid,
  instead of the original :doc:`fix ttm <fix_ttm>` which uses a
  replicated grid.

doc/src/Developer_par_neigh.rst (new file)
@@ -0,0 +1,159 @@
Neighbor lists
^^^^^^^^^^^^^^

To compute forces efficiently, each processor creates a Verlet-style
neighbor list which enumerates all pairs of atoms *i,j* (*i* = owned,
*j* = owned or ghost) with separation less than the applicable
neighbor list cutoff distance.  In LAMMPS the neighbor lists are stored
in a multiple-page data structure; each page is a contiguous chunk of
memory which stores vectors of neighbor atoms *j* for many *i* atoms.
This allows pages to be incrementally allocated or deallocated in blocks
as needed.  Neighbor lists typically consume the most memory of any data
structure in LAMMPS.  The neighbor list is rebuilt (from scratch) once
every few timesteps, then used repeatedly each step for force or other
computations.  The neighbor cutoff distance is :math:`R_n = R_f +
\Delta_s`, where :math:`R_f` is the (largest) force cutoff defined by
the interatomic potential for computing short-range pairwise or manybody
forces and :math:`\Delta_s` is a "skin" distance that allows the list to
be used for multiple steps, assuming that atoms do not move very far
between consecutive time steps.  Typically the code triggers
reneighboring when any atom has moved half the skin distance since the
last reneighboring; this and other options of the neighbor list rebuild
can be adjusted with the :doc:`neigh_modify <neigh_modify>` command.

On steps when reneighboring is performed, atoms which have moved outside
their owning processor's sub-domain are first migrated to new processors
via communication.  Periodic boundary conditions are also (only)
enforced on these steps to ensure each atom is re-assigned to the
correct processor.  After migration, the atoms owned by each processor
are stored in a contiguous vector.  Periodically, each processor
spatially sorts its owned atoms within this vector to reorder it for
improved cache efficiency in force computations and neighbor list
building.  For that, atoms are spatially binned and then reordered so
that atoms in the same bin are adjacent in the vector.  Atom sorting can
be disabled or its settings modified with the :doc:`atom_modify
<atom_modify>` command.

.. _neighbor-stencil:
.. figure:: img/neigh-stencil.png
   :align: center

   neighbor list stencils

   A 2d simulation sub-domain (thick black line) and the corresponding
   ghost atom cutoff region (dashed blue line) for both orthogonal
   (left) and triclinic (right) domains.  A regular grid of neighbor
   bins (thin lines) overlays the entire simulation domain and need not
   align with sub-domain boundaries; only the portion overlapping the
   augmented sub-domain is shown.  In the triclinic case it overlaps the
   bounding box of the tilted rectangle.  The blue- and red-shaded bins
   represent a stencil of bins searched to find neighbors of a
   particular atom (black dot).

To build a local neighbor list in linear time, the simulation domain is
overlaid (conceptually) with a regular 3d (or 2d) grid of neighbor bins,
as shown in the :ref:`neighbor-stencil` figure for 2d models and a
single MPI processor's sub-domain.  Each processor stores a set of
neighbor bins which overlap its sub-domain extended by the neighbor
cutoff distance :math:`R_n`.  As illustrated, the bins need not align
with processor boundaries; an integer number of bins in each dimension
is fit to the size of the entire simulation box.

Most often LAMMPS builds what it calls a "half" neighbor list, where
each *i,j* neighbor pair is stored only once, with either atom *i* or
*j* as the central atom.  The build can be done efficiently by using a
pre-computed "stencil" of bins around a central origin bin which
contains the atom whose neighbors are being searched for.  A stencil
is simply a list of integer offsets in *x,y,z* of nearby bins
surrounding the origin bin which are close enough to contain any
neighbor atom *j* within a distance :math:`R_n` from any atom *i* in the
origin bin.  Note that for a half neighbor list, the stencil can be
asymmetric, since each atom only needs to store half of its nearby
neighbors.

These stencils are illustrated in the figure for a half list and a bin
size of :math:`\frac{1}{2} R_n`.  There are 13 red+blue stencil bins in
2d (for the orthogonal case, 15 for triclinic).  In 3d there would be
63: 13 in the plane of bins that contains the origin bin and 25 in each
of the two planes above it in the *z* direction (75 for triclinic).  The
reason the triclinic stencil has extra bins is that the bins tile the
bounding box of the entire triclinic domain and thus are not periodic
with respect to the simulation box itself.  The stencil and the logic
for determining which *i,j* pairs to include in the neighbor list are
altered slightly to account for this.

To build a neighbor list, a processor first loops over its "owned" plus
"ghost" atoms and assigns each to a neighbor bin.  This uses an integer
vector to create a linked list of atom indices within each bin.  It then
performs a triply-nested loop over its owned atoms *i*, the stencil of
bins surrounding atom *i*'s bin, and the *j* atoms in each stencil bin
(including ghost atoms).  If the distance :math:`r_{ij} < R_n`, then
atom *j* is added to the vector of atom *i*'s neighbors.

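The following is a bare-bones sketch of that binning and stencil search
(illustrative only; the actual LAMMPS neighbor classes additionally
handle pages, the half-list tie-breaking described below, triclinic
bins, and many other options).  For simplicity it uses a full symmetric
stencil plus an index test to store each pair once, whereas LAMMPS uses
an asymmetric stencil:

.. code-block:: c++

   #include <vector>

   // "bins" maps a flat bin index to the atoms it contains, "atom2bin" maps an
   // atom to its flat bin index, and "stencil" holds bin-index offsets.  The
   // bin grid is assumed to cover the ghost region so offsets stay in range.
   void build_half_list(int nlocal, double **x, double cutneigh,
                        const std::vector<std::vector<int>> &bins,
                        const std::vector<int> &stencil,
                        const std::vector<int> &atom2bin,
                        std::vector<std::vector<int>> &neigh)
   {
     const double cutsq = cutneigh * cutneigh;
     neigh.assign(nlocal, std::vector<int>());
     for (int i = 0; i < nlocal; ++i) {                 // loop over owned atoms
       for (int offset : stencil) {                     // loop over stencil bins
         for (int j : bins[atom2bin[i] + offset]) {     // atoms in that bin (incl. ghosts)
           if (j <= i) continue;                        // store each pair only once
           const double dx = x[i][0] - x[j][0];
           const double dy = x[i][1] - x[j][1];
           const double dz = x[i][2] - x[j][2];
           if (dx*dx + dy*dy + dz*dz < cutsq) neigh[i].push_back(j);
         }
       }
     }
   }
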
Here are additional details about neighbor list build options LAMMPS
supports:

- The choice of bin size is an option; a size half of :math:`R_n` has
  been found to be optimal for many typical cases.  Smaller bins incur
  additional overhead to loop over; larger bins require more distance
  calculations.  Note that for smaller bin sizes, the 2d stencil in the
  figure would be more semi-circular in shape (hemispherical in 3d),
  with bins near the corners of the square eliminated due to their
  distance from the origin bin.

- Depending on the interatomic potential(s) and other commands used in
  an input script, multiple neighbor lists and stencils with different
  attributes may be needed.  This includes lists with different cutoff
  distances, e.g. for force computation versus occasional diagnostic
  computations such as a radial distribution function, or for the
  r-RESPA time integrator which can partition pairwise forces by
  distance into subsets computed at different time intervals.  It
  includes "full" lists (as opposed to half lists) where each *i,j* pair
  appears twice, stored once with *i* and once with *j*, and which use a
  larger symmetric stencil.  It also includes lists with partial
  enumeration of ghost atom neighbors.  The full and ghost-atom lists
  are used by various manybody interatomic potentials.  Lists may also
  use different criteria for inclusion of a pair interaction.  Typically
  this depends only on the distance between two atoms and the cutoff
  distance.  But for finite-size coarse-grained particles with
  individual diameters (e.g. polydisperse granular particles), it can
  also depend on the diameters of the two particles.

- When using :doc:`pair style hybrid <pair_hybrid>`, multiple sub-lists
  of the master neighbor list for the full system need to be generated,
  one for each sub-style, which contain only the *i,j* pairs needed to
  compute interactions between subsets of atoms for the corresponding
  potential.  This means not all *i* or *j* atoms owned by a processor
  are included in a particular sub-list.

- Some models use different cutoff lengths for pairwise interactions
  between different kinds of particles, which are stored in a single
  neighbor list.  One example is a solvated colloidal system with large
  colloidal particles where colloid/colloid, colloid/solvent, and
  solvent/solvent interaction cutoffs can be dramatically different.
  Another is a model of polydisperse finite-size granular particles;
  pairs of particles interact only when they are in contact with each
  other.  Mixtures with particle size ratios as high as 10-100x may be
  used to model realistic systems.  Efficient neighbor list building
  algorithms for these kinds of systems are available in LAMMPS.  They
  include a method which uses different stencils for different cutoff
  lengths and trims the stencil to only include bins that straddle the
  cutoff sphere surface.  More recently, a method which uses both
  multiple stencils and multiple bin sizes was developed; it builds
  neighbor lists efficiently for systems with particles of any size
  ratio, though other considerations (timestep size, force computations)
  may limit the ability to model systems with huge polydispersity.

- For small and sparse systems, and as a fallback method, LAMMPS also
  supports neighbor list construction without binning by using a full
  :math:`O(N^2)` loop over all *i,j* atom pairs in a sub-domain when
  using the :doc:`neighbor nsq <neighbor>` command.

- Depending on the "pair" setting of the :doc:`newton <newton>` command,
  the "half" neighbor lists may contain **all** pairs of atoms where
  atom *j* is a ghost atom (i.e. when the newton pair setting is *off*).
  For the newton pair *on* setting, atom *j* is only added to the list
  if its *z* coordinate is larger than that of atom *i*, or, if equal,
  its *y* coordinate is larger, or, if that is also equal, its *x*
  coordinate is larger (see the sketch below).  For homogeneously dense
  systems, this results in picking neighbors from a similarly sized
  sector in the same direction relative to the "owned" atom, which
  should lead to neighbor lists of similar length and thus reduce the
  chance of load imbalance.
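
A small sketch of that coordinate tie-break for a ghost neighbor *j* of
owned atom *i* (illustrative only, not the exact LAMMPS code):

.. code-block:: c++

   // Include the pair only if j is "above" i in z, then y, then x order,
   // so that exactly one of the two periodic image copies stores the pair.
   inline bool newton_on_include(const double *xi, const double *xj)
   {
     if (xj[2] != xi[2]) return xj[2] > xi[2];
     if (xj[1] != xi[1]) return xj[1] > xi[1];
     return xj[0] > xi[0];
   }
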

doc/src/Developer_par_openmp.rst (new file)
@@ -0,0 +1,114 @@
OpenMP Parallelism
^^^^^^^^^^^^^^^^^^

The styles in the INTEL, KOKKOS, and OPENMP packages can use OpenMP
thread parallelism, predominantly to distribute loops over local data,
and thus follow a parallelization strategy that is orthogonal to the
decomposition into spatial domains used by the :doc:`MPI partitioning
<Developer_par_part>`.  For clarity, this section discusses only the
implementation in the OPENMP package, as it is the simplest.  The INTEL
and KOKKOS packages offer additional options and are more complex, since
they support more features and different hardware like co-processors
or GPUs.

One of the key decisions when implementing the OPENMP package was to
keep the changes to the source code small, so that it would be easier to
maintain the code and keep it in sync with the non-threaded standard
implementation.  This is achieved by a) making the OPENMP version a
derived class of the regular version (e.g. ``PairLJCutOMP`` from
``PairLJCut``) and overriding only methods that are multi-threaded or
need to be modified to support multi-threading (similar to what was done
in the OPT package), b) keeping the structure of the modified code very
similar so that side-by-side comparisons are still useful, and c)
offloading additional functionality and multi-thread support functions
into three separate classes ``ThrOMP``, ``ThrData``, and ``FixOMP``.
``ThrOMP`` provides additional, multi-thread aware functionality not
available in the corresponding base class (e.g. ``Pair`` for
``PairLJCutOMP``), like multi-thread aware variants of the "tally"
functions.  Those functions are made available through multiple
inheritance, so the new functions have to have unique names to avoid
ambiguities; typically ``_thr`` is appended to the name of the function.
``ThrData`` is a class that manages per-thread data structures.
It is used instead of extending the corresponding storage to per-thread
arrays to avoid slowdowns due to "false sharing", where multiple threads
update adjacent elements in an array and thus force the CPU cache lines
to be reset and re-fetched.  ``FixOMP``, finally, manages the
"multi-thread state" like settings and access to per-thread storage; it
is activated by the :doc:`package omp <package>` command.

Avoiding data races
"""""""""""""""""""

A key problem when implementing thread parallelism in an MD code is
to avoid data races when updating accumulated properties like forces,
energies, and stresses.  When interactions are computed, they always
involve multiple atoms and thus there are race conditions when multiple
threads want to update per-atom data of the same atoms.  Five possible
strategies have been considered to avoid this:

1) restructure the code so that no overlapping access is possible
   when computing in parallel, e.g. by breaking lists into multiple
   parts and synchronizing threads in between,
2) have each thread be "responsible" for a specific group of atoms and
   compute these interactions multiple times, once on each thread that
   is responsible for a given atom, and then have each thread only
   update the properties of that atom,
3) use mutexes around functions and regions of code where the data race
   could happen,
4) use atomic operations when updating per-atom properties, or
5) use replicated per-thread data structures to accumulate data without
   conflicts and then use a reduction to combine those results into the
   data structures used by the regular style.

Option 5 was chosen for the OPENMP package because it retains the
performance of the 1-thread case and keeps the code more maintainable.
Option 1 would require extensive code changes, particularly to the
neighbor list code; option 2 would have incurred a 2x or more
performance penalty for the serial case; option 3 causes significant
overhead and would enforce serialization of operations in inner loops
and thus defeat the purpose of multi-threading; option 4 slows down the
serial case, although not quite as badly as option 2.  The downside of
option 5 is that the overhead of the reduction operations grows with the
number of threads used, so there would be a crossover point where
options 2 or 4 would result in faster execution.  That is why option 2,
for example, is used in the GPU package, because a GPU is a processor
with a massive number of threads.  However, since the MPI
parallelization is generally more effective for typical MD systems, the
expectation is that thread parallelism is only used for a smaller number
of threads (2-8).  At the time of its implementation, that number was
equivalent to the number of CPU cores per CPU socket on high-end
supercomputers.

Thus arrays like the force array are dimensioned to the number of atoms
times the number of threads when enabling OpenMP support, and inside the
compute functions each thread obtains a pointer to a different chunk.
Similarly, accumulators like potential energy or virial are kept in
per-thread instances of the ``ThrData`` class and only reduced and
stored in their global counterparts at the end of the force computation.

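A condensed sketch of this strategy (illustrative only; in the actual
OPENMP package this is wired through ``ThrData`` and the ``_thr`` tally
functions, and the force loop itself is omitted here):

.. code-block:: c++

   #include <omp.h>

   // f has nthreads*nall rows of 3 doubles; each thread zeroes and fills its
   // own chunk, then all chunks are summed into the first one.  "nthreads" is
   // assumed to match omp_get_num_threads().
   void compute_forces_threaded(int nall, int nthreads, double **f)
   {
     #pragma omp parallel
     {
       const int tid = omp_get_thread_num();
       double **fthr = f + tid * nall;                // this thread's private chunk
       for (int i = 0; i < nall; ++i)
         fthr[i][0] = fthr[i][1] = fthr[i][2] = 0.0;

       // ... loop over this thread's share of owned atoms i and their neighbors,
       // accumulating pair forces into fthr[i] and fthr[j] without races ...

       #pragma omp barrier                            // wait until all chunks are filled
       #pragma omp for                                // parallel reduction into chunk 0
       for (int i = 0; i < nall; ++i)
         for (int t = 1; t < nthreads; ++t) {
           f[i][0] += f[t * nall + i][0];
           f[i][1] += f[t * nall + i][1];
           f[i][2] += f[t * nall + i][2];
         }
     }
   }
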
Loop scheduling
"""""""""""""""

Multi-thread parallelization is applied by statically distributing
(outer) loops across threads.  Typically this is the loop over local
atoms *i* when processing *i,j* pairs of atoms from a neighbor list.
The design of the neighbor list code results in atoms having a similar
number of neighbors for homogeneous systems, so load imbalances across
threads are not common; they typically occur for systems where the MPI
parallelization would also be unbalanced, which usually has a more
pronounced impact on performance.  The same loop scheduling scheme can
also be applied to the reduction operations on per-atom data to reduce
the overhead of the reduction operation.

Neighbor list parallelization
"""""""""""""""""""""""""""""

In addition to the force computations, the generation of the neighbor
lists is also parallelized.  As explained previously, neighbor lists are
built by looping over "owned" atoms and storing the neighbors in
"pages".  In the OPENMP variants of the neighbor list code, each thread
operates on a different chunk of "owned" atoms and allocates and fills
its own set of pages with neighbor list data.  This is achieved by each
thread keeping its own instance of the :cpp:class:`MyPage
<LAMMPS_NS::MyPage>` page allocator class.

doc/src/Developer_par_part.rst (new file)
@@ -0,0 +1,89 @@
Partitioning
^^^^^^^^^^^^

The underlying spatial decomposition strategy used by LAMMPS for
distributed-memory parallelism is set with the :doc:`comm_style command
<comm_style>` and can be either "brick" (a regular grid) or "tiled".

.. _domain-decomposition:
.. figure:: img/domain-decomp.png
   :align: center

   domain decomposition

   This figure shows the different kinds of domain decomposition used
   for MPI parallelization: "brick" on the left with an orthogonal
   (left) and a triclinic (middle) simulation domain, and a "tiled"
   decomposition (right).  The black lines show the division into
   sub-domains, and the contained atoms are "owned" by the corresponding
   MPI process.  The green dashed lines indicate how sub-domains are
   extended with "ghost" atoms up to the communication cutoff distance.

The LAMMPS simulation box is a 3d or 2d volume, which can be orthogonal
or triclinic in shape, as illustrated in the :ref:`domain-decomposition`
figure for the 2d case.  Orthogonal means the box edges are aligned with
the *x*, *y*, *z* Cartesian axes, and the box faces are thus all
rectangular.  Triclinic allows for a more general parallelepiped shape
in which edges are aligned with three arbitrary vectors and the box
faces are parallelograms.  In each dimension, box faces can be periodic,
or non-periodic with fixed or shrink-wrapped boundaries.  In the fixed
case, atoms which move outside the face are deleted; shrink-wrapped
means the position of the box face adjusts continuously to enclose all
the atoms.

For distributed-memory MPI parallelism, the simulation box is spatially
decomposed (partitioned) into non-overlapping sub-domains which fill the
box.  The default partitioning, "brick", is most suitable when the atom
density is roughly uniform, as shown in the left-side images of the
:ref:`domain-decomposition` figure.  The sub-domains comprise a regular
grid and all sub-domains are identical in size and shape.  Both the
orthogonal and triclinic boxes can deform continuously during a
simulation, e.g. to compress a solid or shear a liquid, in which case
the processor sub-domains likewise deform.

For models with non-uniform density, the number of particles per
processor can be load-imbalanced with the default partitioning.  This
reduces parallel efficiency, as the overall simulation rate is limited
by the slowest processor, i.e. the one with the largest computational
load.  For such models, LAMMPS supports multiple strategies to reduce
the load imbalance:

- The processor grid decomposition is by default based on the simulation
  cell volume and tries to optimize the volume to surface ratio of the
  sub-domains.  This can be changed with the :doc:`processors command
  <processors>`.
- The parallel planes defining the size of the sub-domains can be
  shifted with the :doc:`balance command <balance>`; this can be done in
  addition to choosing a more optimal processor grid.
- The recursive bisectioning algorithm in combination with the "tiled"
  communication style can produce a partitioning with equal numbers of
  particles in each sub-domain.

.. |decomp1| image:: img/decomp-regular.png
   :width: 24%
.. |decomp2| image:: img/decomp-processors.png
   :width: 24%
.. |decomp3| image:: img/decomp-balance.png
   :width: 24%
.. |decomp4| image:: img/decomp-rcb.png
   :width: 24%

|decomp1| |decomp2| |decomp3| |decomp4|

The pictures above demonstrate different decompositions for a 2d system
with 12 MPI ranks.  The atom colors indicate the load imbalance of each
sub-domain, with green being optimal and red the least optimal.

Due to the vacuum in the system, the default decomposition is
unbalanced, with several MPI ranks without atoms (left).  By forcing a
1x12x1 processor grid, every MPI rank now does computations, but the
number of atoms per sub-domain is still uneven, and the thin slice shape
increases the amount of communication between sub-domains (center
left).  With a 2x6x1 processor grid and shifting of the sub-domain
divisions, the load imbalance is further reduced and the amount of
communication required between sub-domains is less (center right).
Using recursive bisectioning leads to a further improved decomposition
(right).

doc/src/Developer_parallel.rst (new file)
@@ -0,0 +1,28 @@
Parallel algorithms
-------------------

LAMMPS is designed to enable running simulations in parallel using the
MPI parallel communication standard with distributed data via domain
decomposition.  The parallelization aims to be efficient and to result
in good strong scaling (= good speedup for the same system) and good
weak scaling (= the computational cost of enlarging the system is
proportional to the system size).  Additional parallelization using GPUs
or OpenMP can also be applied within the sub-domain assigned to an MPI
process.  For clarity, most of the following illustrations show the 2d
simulation case.  The underlying algorithms, however, apply to both the
2d and 3d cases equally well.

.. note::

   The text and most of the figures in this chapter were adapted
   for the manual from the section on parallel algorithms in the
   :ref:`new LAMMPS paper <lammps_paper>`.

.. toctree::
   :maxdepth: 1

   Developer_par_part
   Developer_par_comm
   Developer_par_neigh
   Developer_par_long
   Developer_par_openmp

@@ -4,28 +4,40 @@ Citing LAMMPS
Core Algorithms
^^^^^^^^^^^^^^^

The paper mentioned below is the best overview of LAMMPS, but there are
also publications describing particular models or algorithms implemented
in LAMMPS or complementary software that it has interfaces to.  Please
see below for how to cite contributions to LAMMPS.

.. _lammps_paper:

The latest canonical publication that describes the basic features, the
source code design, the program structure, the spatial decomposition
approach, the neighbor finding, basic communications algorithms, and how
users and developers have contributed to LAMMPS is:

`LAMMPS - A flexible simulation tool for particle-based materials modeling at the atomic, meso, and continuum scales, Comp. Phys. Comm. (accepted 09/2021), DOI:10.1016/j.cpc.2021.108171 <https://doi.org/10.1016/j.cpc.2021.108171>`_

So a project using LAMMPS or a derivative application that uses LAMMPS
as a simulation engine should cite this paper.  The paper is expected to
be published in its final form under the same DOI in the first half
of 2022.

The original publication describing the parallel algorithms used in the
initial versions of LAMMPS is:

`S. Plimpton, Fast Parallel Algorithms for Short-Range Molecular Dynamics, J Comp Phys, 117, 1-19 (1995). <http://www.sandia.gov/~sjplimp/papers/jcompphys95.pdf>`_

DOI for the LAMMPS code
^^^^^^^^^^^^^^^^^^^^^^^

LAMMPS developers use the `Zenodo service at CERN <https://zenodo.org/>`_
to create digital object identifiers (DOI) for stable releases of the
LAMMPS source code.  There are two types of DOIs for the LAMMPS source
code.

The canonical DOI for **all** versions of LAMMPS, which will always
point to the **latest** stable release version is:

- DOI: `10.5281/zenodo.3726416 <https://dx.doi.org/10.5281/zenodo.3726416>`_

@@ -45,11 +57,13 @@ about LAMMPS and its features.
Citing contributions
^^^^^^^^^^^^^^^^^^^^

LAMMPS has many features that use either previously published methods
and algorithms or novel features.  It also includes potential parameter
files for specific models.  Where available, a reminder about references
for optional features used in a specific run is printed to the screen
and log file.  Style and output location can be selected with the
:ref:`-cite command-line switch <cite>`.  Additional references are
given in the documentation of the :doc:`corresponding commands
<Commands_all>` or in the :doc:`Howto tutorials <Howto>`.  So please
make certain that you provide the proper acknowledgments and citations
in any published works using LAMMPS.

New binary image files under doc/src/img/: decomp-balance.png (129 KiB),
decomp-processors.png (121 KiB), decomp-rcb.png (121 KiB),
decomp-regular.png (121 KiB), domain-decomp.png (547 KiB),
fft-decomp-parallel.png (35 KiB), ghost-comm.png (33 KiB),
neigh-stencil.png (53 KiB).

@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -85,21 +84,23 @@ void KimInit::command(int narg, char **arg)
   if ((narg < 2) || (narg > 3)) error->all(FLERR, "Illegal 'kim init' command");
 
   if (domain->box_exist)
-    error->all(FLERR, "Must use 'kim init' command before "
-               "simulation box is defined");
+    error->all(FLERR, "Must use 'kim init' command before simulation box is defined");
 
   char *model_name = utils::strdup(arg[0]);
   char *user_units = utils::strdup(arg[1]);
   if (narg == 3) {
     auto arg_str = std::string(arg[2]);
-    if (arg_str == "unit_conversion_mode") unit_conversion_mode = true;
+    if (arg_str == "unit_conversion_mode")
+      unit_conversion_mode = true;
     else {
-      error->all(FLERR, "Illegal 'kim init' command.\nThe argument "
-                 "followed by unit_style {} is an optional "
-                 "argument and when is used must "
-                 "be unit_conversion_mode", user_units);
+      error->all(FLERR,
+                 "Illegal 'kim init' command.\n"
+                 "The argument followed by unit_style {} is an optional argument and when "
+                 "is used must be unit_conversion_mode",
+                 user_units);
     }
-  } else unit_conversion_mode = false;
+  } else
+    unit_conversion_mode = false;
 
   char *model_units;
   KIM_Model *pkim = nullptr;
@@ -117,14 +118,9 @@ void KimInit::command(int narg, char **arg)
 /* ---------------------------------------------------------------------- */
 
 namespace {
-void get_kim_unit_names(
-    char const * const system,
-    KIM_LengthUnit & lengthUnit,
-    KIM_EnergyUnit & energyUnit,
-    KIM_ChargeUnit & chargeUnit,
-    KIM_TemperatureUnit & temperatureUnit,
-    KIM_TimeUnit & timeUnit,
-    Error * error)
+void get_kim_unit_names(char const *const system, KIM_LengthUnit &lengthUnit,
+                        KIM_EnergyUnit &energyUnit, KIM_ChargeUnit &chargeUnit,
+                        KIM_TemperatureUnit &temperatureUnit, KIM_TimeUnit &timeUnit, Error *error)
 {
   const std::string system_str(system);
   if (system_str == "real") {
@@ -157,20 +153,64 @@ void get_kim_unit_names(
     chargeUnit = KIM_CHARGE_UNIT_e;
     temperatureUnit = KIM_TEMPERATURE_UNIT_K;
     timeUnit = KIM_TIME_UNIT_fs;
-  } else if ((system_str == "lj") ||
-             (system_str == "micro") ||
-             (system_str == "nano")) {
-    error->all(FLERR, "LAMMPS unit_style {} not supported "
-               "by KIM models", system_str);
+  } else if ((system_str == "lj") || (system_str == "micro") || (system_str == "nano")) {
+    error->all(FLERR, "LAMMPS unit_style {} not supported by KIM models", system_str);
   } else {
     error->all(FLERR, "Unknown unit_style");
   }
 }
 } // namespace
 
-void KimInit::determine_model_type_and_units(char * model_name,
-                                             char * user_units,
-                                             char ** model_units,
+void KimInit::print_dirs(struct KIM_Collections *const collections) const
+{
+  int kim_error = 0;
+  int dirListExtent = 0;
+  int dirCounter = 0;
+
+  std::string mesg = "#=== KIM is looking for 'Portable Models' in these directories ===\n";
+  std::vector<struct KIM_Collection> collection_list;
+  collection_list.push_back(KIM_COLLECTION_currentWorkingDirectory);
+  collection_list.push_back(KIM_COLLECTION_environmentVariable);
+  collection_list.push_back(KIM_COLLECTION_user);
+  collection_list.push_back(KIM_COLLECTION_system);
+
+  for (auto col : collection_list) {
+    kim_error = KIM_Collections_CacheListOfDirectoryNames(
+        collections, col, KIM_COLLECTION_ITEM_TYPE_portableModel, &dirListExtent);
+    if (!kim_error) {
+      for (int i = 0; i < dirListExtent; ++i) {
+        char const *name;
+        kim_error = KIM_Collections_GetDirectoryName(collections, i, &name);
+        // Don't check for error due to bug in kim-api-2.2.1 and below.
+#if ((KIM_VERSION_MAJOR * 1000 + KIM_VERSION_MINOR) * 1000 + KIM_VERSION_PATCH) <= 2002001
+        kim_error = 0;
+#endif
+        if (!kim_error) mesg += fmt::format("# {:2}: {}\n", ++dirCounter, name);
+      }
+    }
+  }
+
+  dirCounter = 0;
+  mesg += "#=== KIM is looking for 'Simulator Models' in these directories ===\n";
+  for (auto col : collection_list) {
+    kim_error = KIM_Collections_CacheListOfDirectoryNames(
+        collections, col, KIM_COLLECTION_ITEM_TYPE_simulatorModel, &dirListExtent);
+    if (!kim_error) {
+      for (int i = 0; i < dirListExtent; ++i) {
+        char const *name;
+        kim_error = KIM_Collections_GetDirectoryName(collections, i, &name);
+        // Don't check for error due to bug in kim-api-2.2.1 and below.
+#if ((KIM_VERSION_MAJOR * 1000 + KIM_VERSION_MINOR) * 1000 + KIM_VERSION_PATCH) <= 2002001
+        kim_error = 0;
+#endif
+        if (!kim_error) mesg += fmt::format("# {:2}: {}\n", ++dirCounter, name);
+      }
+    }
+  }
+  input->write_echo(mesg);
+}
+
+void KimInit::determine_model_type_and_units(char *model_name, char *user_units, char **model_units,
                                              KIM_Model *&pkim)
 {
   KIM_LengthUnit lengthUnit;
@@ -183,29 +223,22 @@ void KimInit::determine_model_type_and_units(char * model_name,
   KIM_CollectionItemType itemType;
 
   int kim_error = KIM_Collections_Create(&collections);
-  if (kim_error)
-    error->all(FLERR, "Unable to access KIM Collections to find Model");
+  if (kim_error) error->all(FLERR, "Unable to access KIM Collections to find Model");
 
   auto logID = fmt::format("{}_Collections", comm->me);
   KIM_Collections_SetLogID(collections, logID.c_str());
 
+  print_dirs(collections);
+
   kim_error = KIM_Collections_GetItemType(collections, model_name, &itemType);
   if (kim_error) error->all(FLERR, "KIM Model name not found");
   KIM_Collections_Destroy(&collections);
 
-  if (KIM_CollectionItemType_Equal(itemType,
-                                   KIM_COLLECTION_ITEM_TYPE_portableModel)) {
-    get_kim_unit_names(user_units, lengthUnit, energyUnit,
-                       chargeUnit, temperatureUnit, timeUnit, error);
-    int kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased,
-                                     lengthUnit,
-                                     energyUnit,
-                                     chargeUnit,
-                                     temperatureUnit,
-                                     timeUnit,
-                                     model_name,
-                                     &units_accepted,
-                                     &pkim);
+  if (KIM_CollectionItemType_Equal(itemType, KIM_COLLECTION_ITEM_TYPE_portableModel)) {
+    get_kim_unit_names(user_units, lengthUnit, energyUnit, chargeUnit, temperatureUnit, timeUnit,
+                       error);
+    int kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased, lengthUnit, energyUnit, chargeUnit,
+                                     temperatureUnit, timeUnit, model_name, &units_accepted, &pkim);
 
   if (kim_error) error->all(FLERR, "Unable to load KIM Simulator Model");
 
@@ -219,20 +252,12 @@ void KimInit::determine_model_type_and_units(char * model_name,
   } else if (unit_conversion_mode) {
     KIM_Model_Destroy(&pkim);
     int const num_systems = 5;
-    char const * const systems[num_systems]
-        = {"metal", "real", "si", "cgs", "electron"};
+    char const *const systems[num_systems] = {"metal", "real", "si", "cgs", "electron"};
     for (int i = 0; i < num_systems; ++i) {
-      get_kim_unit_names(systems[i], lengthUnit, energyUnit,
-                         chargeUnit, temperatureUnit, timeUnit, error);
-      kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased,
-                                   lengthUnit,
-                                   energyUnit,
-                                   chargeUnit,
-                                   temperatureUnit,
-                                   timeUnit,
-                                   model_name,
-                                   &units_accepted,
-                                   &pkim);
+      get_kim_unit_names(systems[i], lengthUnit, energyUnit, chargeUnit, temperatureUnit,
+                         timeUnit, error);
+      kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased, lengthUnit, energyUnit, chargeUnit,
+                                   temperatureUnit, timeUnit, model_name, &units_accepted, &pkim);
       if (units_accepted) {
         logID = fmt::format("{}_Model", comm->me);
         KIM_Model_SetLogID(pkim, logID.c_str());
@@ -246,12 +271,10 @@ void KimInit::determine_model_type_and_units(char * model_name,
         KIM_Model_Destroy(&pkim);
         error->all(FLERR, "KIM Model does not support the requested unit system");
       }
-  } else if (KIM_CollectionItemType_Equal(
-                 itemType, KIM_COLLECTION_ITEM_TYPE_simulatorModel)) {
+  } else if (KIM_CollectionItemType_Equal(itemType, KIM_COLLECTION_ITEM_TYPE_simulatorModel)) {
     KIM_SimulatorModel *simulatorModel;
     kim_error = KIM_SimulatorModel_Create(model_name, &simulatorModel);
-    if (kim_error)
-      error->all(FLERR, "Unable to load KIM Simulator Model");
+    if (kim_error) error->all(FLERR, "Unable to load KIM Simulator Model");
     model_type = SM;
 
     logID = fmt::format("{}_SimulatorModel", comm->me);
@@ -264,13 +287,11 @@ void KimInit::determine_model_type_and_units(char * model_name,
     KIM_SimulatorModel_GetNumberOfSimulatorFields(simulatorModel, &sim_fields);
     KIM_SimulatorModel_CloseTemplateMap(simulatorModel);
     for (int i = 0; i < sim_fields; ++i) {
-      KIM_SimulatorModel_GetSimulatorFieldMetadata(
-          simulatorModel, i, &sim_lines, &sim_field);
+      KIM_SimulatorModel_GetSimulatorFieldMetadata(simulatorModel, i, &sim_lines, &sim_field);
 
       const std::string sim_field_str(sim_field);
       if (sim_field_str == "units") {
-        KIM_SimulatorModel_GetSimulatorFieldLine(
-            simulatorModel, i, 0, &sim_value);
+        KIM_SimulatorModel_GetSimulatorFieldLine(simulatorModel, i, 0, &sim_value);
         *model_units = utils::strdup(sim_value);
         break;
       }
@@ -280,16 +301,15 @@ void KimInit::determine_model_type_and_units(char * model_name,
     const std::string model_units_str(*model_units);
     const std::string user_units_str(user_units);
     if ((!unit_conversion_mode) && (model_units_str != user_units_str)) {
-      error->all(FLERR, "Incompatible units for KIM Simulator Model"
-                 ", required units = {}", model_units_str);
+      error->all(FLERR, "Incompatible units for KIM Simulator Model, required units = {}",
+                 model_units_str);
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
-void KimInit::do_init(char *model_name, char *user_units, char *model_units,
-                      KIM_Model *&pkim)
+void KimInit::do_init(char *model_name, char *user_units, char *model_units, KIM_Model *&pkim)
 {
   // create storage proxy fix. delete existing fix, if needed.
 
@@ -304,8 +324,7 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
   fix_store->setptr("model_units", (void *) model_units);
 
   // Begin output to log file
-  input->write_echo("#=== BEGIN kim init ==================================="
-                    "=======\n");
+  input->write_echo("#=== BEGIN kim init ==========================================\n");
 
   KIM_SimulatorModel *simulatorModel;
   if (model_type == SM) {
@@ -316,18 +335,16 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
     KIM_SimulatorModel_SetLogID(simulatorModel, logID.c_str());
 
     char const *sim_name, *sim_version;
-    KIM_SimulatorModel_GetSimulatorNameAndVersion(
-        simulatorModel, &sim_name, &sim_version);
+    KIM_SimulatorModel_GetSimulatorNameAndVersion(simulatorModel, &sim_name, &sim_version);
 
     const std::string sim_name_str(sim_name);
-    if (sim_name_str != "LAMMPS")
-      error->all(FLERR, "Incompatible KIM Simulator Model");
+    if (sim_name_str != "LAMMPS") error->all(FLERR, "Incompatible KIM Simulator Model");
 
     if (comm->me == 0) {
       auto mesg = fmt::format("# Using KIM Simulator Model : {}\n"
                               "# For Simulator             : {} {}\n"
-                              "# Running on                : LAMMPS {}\n#\n", model_name,
-                              sim_name_str, sim_version, lmp->version);
+                              "# Running on                : LAMMPS {}\n#\n",
+                              model_name, sim_name_str, sim_version, lmp->version);
       utils::logmesg(lmp, mesg);
     }
 
@@ -350,18 +367,16 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
     // Set the skin and timestep default values as
     // 2.0 Angstroms and 1.0 femtosecond
 
-    const std::string skin_cmd =
-      (model_units_str == "real") ? "neighbor 2.0 bin # Angstroms":
-      (model_units_str == "metal") ? "neighbor 2.0 bin # Angstroms":
-      (model_units_str == "si") ? "neighbor 2e-10 bin # meters":
-      (model_units_str == "cgs") ? "neighbor 2e-8 bin # centimeters":
-      "neighbor 3.77945224 bin # Bohr";
-    const std::string step_cmd =
-      (model_units_str == "real") ? "timestep 1.0 # femtoseconds":
-      (model_units_str == "metal") ? "timestep 1.0e-3 # picoseconds":
-      (model_units_str == "si") ? "timestep 1e-15 # seconds":
-      (model_units_str == "cgs") ? "timestep 1e-15 # seconds":
-      "timestep 1.0 # femtoseconds";
+    const std::string skin_cmd = (model_units_str == "real")    ? "neighbor 2.0 bin # Angstroms"
+                                 : (model_units_str == "metal") ? "neighbor 2.0 bin # Angstroms"
+                                 : (model_units_str == "si")    ? "neighbor 2e-10 bin # meters"
+                                 : (model_units_str == "cgs")   ? "neighbor 2e-8 bin # centimeters"
+                                                                : "neighbor 3.77945224 bin # Bohr";
+    const std::string step_cmd = (model_units_str == "real")    ? "timestep 1.0 # femtoseconds"
+                                 : (model_units_str == "metal") ? "timestep 1.0e-3 # picoseconds"
+                                 : (model_units_str == "si")    ? "timestep 1e-15 # seconds"
+                                 : (model_units_str == "cgs")   ? "timestep 1e-15 # seconds"
+                                                                : "timestep 1.0 # femtoseconds";
     input->one(skin_cmd);
     input->one(step_cmd);
 
@@ -373,14 +388,12 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
     // init model
 
     for (int i = 0; i < sim_fields; ++i) {
-      KIM_SimulatorModel_GetSimulatorFieldMetadata(
-          simulatorModel, i, &sim_lines, &sim_field);
+      KIM_SimulatorModel_GetSimulatorFieldMetadata(simulatorModel, i, &sim_lines, &sim_field);
 
       const std::string sim_field_str(sim_field);
       if (sim_field_str == "model-init") {
        for (int j = 0; j < sim_lines; ++j) {
-          KIM_SimulatorModel_GetSimulatorFieldLine(
-              simulatorModel, i, j, &sim_value);
+          KIM_SimulatorModel_GetSimulatorFieldLine(simulatorModel, i, j, &sim_value);
           input->one(sim_value);
         }
         break;
@@ -404,31 +417,28 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
 
   int max_len(0);
   for (int i = 0; i < numberOfParameters; ++i) {
-    KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType,
-                                   &extent, &str_name, &str_desc);
+    KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType, &extent, &str_name, &str_desc);
     max_len = MAX(max_len, (int) strlen(str_name));
   }
   max_len = MAX(18, max_len + 1);
-  mesg += fmt::format(" No. | {:<{}} | data type | extent\n",
-                      "Parameter name", max_len);
+  mesg += fmt::format(" No. | {:<{}} | data type | extent\n", "Parameter name", max_len);
  mesg += fmt::format("{:-<{}}\n", "-", max_len + 35);
  for (int i = 0; i < numberOfParameters; ++i) {
-    KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType,
-                                   &extent, &str_name, &str_desc);
+    KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType, &extent, &str_name, &str_desc);
     auto data_type = std::string("\"");
     data_type += KIM_DataType_ToString(kim_DataType) + std::string("\"");
-    mesg += fmt::format(" {:<8} | {:<{}} | {:<10} | {}\n", i + 1, str_name,
-                        max_len, data_type, extent);
+    mesg += fmt::format(" {:<8} | {:<{}} | {:<10} | {}\n", i + 1, str_name, max_len, data_type,
+                        extent);
   }
-  } else mesg += "No mutable parameters.\n";
+  } else
+    mesg += "No mutable parameters.\n";
 
   KIM_Model_Destroy(&pkim);
   input->write_echo(mesg);
   }
 
   // End output to log file
-  input->write_echo("#=== END kim init ====================================="
-                    "=======\n\n");
+  input->write_echo("#=== END kim init ============================================\n\n");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -446,24 +456,11 @@ void KimInit::do_variables(const std::string &from, const std::string &to)
   int ier;
   std::string var_str;
   int v_unit;
-  const char *units[] = {"mass",
-                         "distance",
-                         "time",
-                         "energy",
-                         "velocity",
-                         "force",
-                         "torque",
-                         "temperature",
-                         "pressure",
-                         "viscosity",
-                         "charge",
-                         "dipole",
-                         "efield",
-                         "density",
-                         nullptr};
+  const char *units[] = {"mass", "distance", "time", "energy", "velocity",
+                         "force", "torque", "temperature", "pressure", "viscosity",
+                         "charge", "dipole", "efield", "density", nullptr};
 
-  input->write_echo(fmt::format("# Conversion factors from {} to {}:\n",
-                                from, to));
+  input->write_echo(fmt::format("# Conversion factors from {} to {}:\n", from, to));
 
   auto variable = input->variable;
   for (int i = 0; units[i] != nullptr; ++i) {
@@ -473,24 +470,23 @@ void KimInit::do_variables(const std::string &from, const std::string &to)
       variable->set(var_str + " internal 1.0");
       v_unit = variable->find(var_str.c_str());
     }
-    ier = lammps_unit_conversion(units[i], from, to,
-                                 conversion_factor);
+    ier = lammps_unit_conversion(units[i], from, to, conversion_factor);
     if (ier != 0)
-      error->all(FLERR, "Unable to obtain conversion factor: "
+      error->all(FLERR,
+                 "Unable to obtain conversion factor: "
                  "unit = {}; from = {}; to = {}",
                  units[i], from, to);
 
     variable->internal_set(v_unit, conversion_factor);
-    input->write_echo(fmt::format("variable {:<15s} internal {:<15.12e}\n",
-                                  var_str, conversion_factor));
+    input->write_echo(
+        fmt::format("variable {:<15s} internal {:<15.12e}\n", var_str, conversion_factor));
   }
   input->write_echo("#\n");
 }
 
 /* ---------------------------------------------------------------------- */
 
-void KimInit::write_log_cite(class LAMMPS *lmp,
-                             KimInit::model_type_enum model_type,
+void KimInit::write_log_cite(class LAMMPS *lmp, KimInit::model_type_enum model_type,
                              char *model_name)
 {
   if (!lmp->citeme) return;
@@ -513,12 +509,10 @@ void KimInit::write_log_cite(class LAMMPS *lmp,
   int extent;
   if (model_type == MO) {
     err = KIM_Collections_CacheListOfItemMetadataFiles(
-        collections, KIM_COLLECTION_ITEM_TYPE_portableModel,
-        model_name, &extent);
+        collections, KIM_COLLECTION_ITEM_TYPE_portableModel, model_name, &extent);
   } else if (model_type == SM) {
     err = KIM_Collections_CacheListOfItemMetadataFiles(
-        collections, KIM_COLLECTION_ITEM_TYPE_simulatorModel,
-        model_name, &extent);
+        collections, KIM_COLLECTION_ITEM_TYPE_simulatorModel, model_name, &extent);
   } else {
     lmp->error->all(FLERR, "Unknown model type");
   }
@@ -529,19 +523,18 @@ void KimInit::write_log_cite(class LAMMPS *lmp,
   }
 
   cite_id = fmt::format("OpenKIM potential: https://openkim.org/cite/"
-                        "{}#item-citation\n\n",kim_id);
+                        "{}#item-citation\n\n",
+                        kim_id);
 
   for (int i = 0; i < extent; ++i) {
     char const *fileName;
     int availableAsString;
     char const *fileString;
-    err = KIM_Collections_GetItemMetadataFile(
-        collections, i, &fileName, nullptr, nullptr,
+    err = KIM_Collections_GetItemMetadataFile(collections, i, &fileName, nullptr, nullptr,
                                               &availableAsString, &fileString);
     if (err) continue;
 
-    if (utils::strmatch(fileName, "^kimcite") && availableAsString)
-      cite_id += fileString;
+    if (utils::strmatch(fileName, "^kimcite") && availableAsString) cite_id += fileString;
   }
   KIM_Collections_Destroy(&collections);
 }
@@ -62,7 +62,8 @@
 #include "pointers.h"
 
 // Forward declaration.
-typedef struct KIM_Model KIM_Model;
+struct KIM_Model;
+struct KIM_Collections;
 
 namespace LAMMPS_NS {
 
@@ -80,6 +81,8 @@ class KimInit : protected Pointers {
   void determine_model_type_and_units(char *, char *, char **, KIM_Model *&);
   void do_init(char *, char *, char *, KIM_Model *&);
   void do_variables(const std::string &, const std::string &);
+
+  void print_dirs(struct KIM_Collections * const collections) const;
 };
 
 } // namespace LAMMPS_NS
@@ -17,23 +17,23 @@
 ------------------------------------------------------------------------- */
 
 #include "pair_eam_alloy_kokkos.h"
-#include <cmath>
-#include <cstring>
-#include "kokkos.h"
-#include "pair_kokkos.h"
 #include "atom_kokkos.h"
-#include "force.h"
+#include "atom_masks.h"
 #include "comm.h"
-#include "neighbor.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
 #include "neigh_list_kokkos.h"
 #include "neigh_request.h"
-#include "memory_kokkos.h"
-#include "error.h"
-#include "atom_masks.h"
+#include "neighbor.h"
+#include "pair_kokkos.h"
 
-#include "tokenizer.h"
 #include "potential_file_reader.h"
 
+#include <cmath>
+#include <cstring>
+
 using namespace LAMMPS_NS;
 
 // Cannot use virtual inheritance on the GPU, so must duplicate code
@@ -44,8 +44,8 @@ template<class DeviceType>
 PairEAMAlloyKokkos<DeviceType>::PairEAMAlloyKokkos(LAMMPS *lmp) : PairEAM(lmp)
 {
   respa_enable = 0;
+  single_enable = 0;
   one_coeff = 1;
-  manybody_flag = 1;
 
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
@@ -261,6 +261,8 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     virial[5] += ev.v[5];
   }
 
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
   if (eflag_atom) {
     if (need_dup)
       Kokkos::Experimental::contribute(d_eatom, dup_eatom);
@@ -275,8 +277,6 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
     k_vatom.template sync<LMPHostType>();
   }
 
-  if (vflag_fdotr) pair_virial_fdotr_compute(this);
-
   copymode = 0;
 
   // free duplicated memory
@@ -322,6 +322,11 @@ void PairEAMAlloyKokkos<DeviceType>::init_style()
 
 }
 
+/* ----------------------------------------------------------------------
+   convert read-in funcfl potential(s) to standard array format
+   interpolate all file values to a single grid and cutoff
+------------------------------------------------------------------------- */
+
 template<class DeviceType>
 void PairEAMAlloyKokkos<DeviceType>::file2array()
 {
@@ -524,7 +529,7 @@ void PairEAMAlloyKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, doubl
     h_rho[j] += buf[m++];
   }
 
-  k_fp.modify_host();
+  k_rho.modify_host();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -597,7 +602,6 @@ template<class DeviceType>
 template<int EFLAG>
 KOKKOS_INLINE_FUNCTION
 void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelB<EFLAG>, const int &ii, EV_FLOAT& ev) const {
-
   // fp = derivative of embedding energy at each atom
   // phi = embedding energy at each atom
   // if rho > rhomax (e.g. due to close approach of two atoms),
@@ -620,7 +624,6 @@ void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelB<EFLAG>, c
   if (eflag_global) ev.evdwl += phi;
   if (eflag_atom) d_eatom[i] += phi;
   }
-
 }
 
 template<class DeviceType>
@@ -17,23 +17,23 @@
 ------------------------------------------------------------------------- */
 
 #include "pair_eam_fs_kokkos.h"
-#include <cmath>
-#include <cstring>
-#include "kokkos.h"
-#include "pair_kokkos.h"
 #include "atom_kokkos.h"
-#include "force.h"
+#include "atom_masks.h"
 #include "comm.h"
-#include "neighbor.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
 #include "neigh_list_kokkos.h"
 #include "neigh_request.h"
-#include "memory_kokkos.h"
-#include "error.h"
-#include "atom_masks.h"
+#include "neighbor.h"
+#include "pair_kokkos.h"
 
-#include "tokenizer.h"
 #include "potential_file_reader.h"
 
+#include <cmath>
+#include <cstring>
+
 using namespace LAMMPS_NS;
 
 // Cannot use virtual inheritance on the GPU, so must duplicate code
@@ -43,9 +43,9 @@ using namespace LAMMPS_NS;
 template<class DeviceType>
 PairEAMFSKokkos<DeviceType>::PairEAMFSKokkos(LAMMPS *lmp) : PairEAM(lmp)
 {
-  one_coeff = 1;
-  manybody_flag = 1;
   respa_enable = 0;
+  single_enable = 0;
+  one_coeff = 1;
 
   kokkosable = 1;
   atomKK = (AtomKokkos *) atom;
@@ -200,9 +200,9 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 
   // communicate derivative of embedding function (on the device)
 
-  k_fp.template sync<DeviceType>();
-  comm->forward_comm_pair(this);
   k_fp.template modify<DeviceType>();
+  comm->forward_comm_pair(this);
+  k_fp.template sync<DeviceType>();
 
   // compute kernel C
 
@@ -322,6 +322,11 @@ void PairEAMFSKokkos<DeviceType>::init_style()
 
 }
 
+/* ----------------------------------------------------------------------
+   convert read-in funcfl potential(s) to standard array format
+   interpolate all file values to a single grid and cutoff
+------------------------------------------------------------------------- */
+
 template<class DeviceType>
 void PairEAMFSKokkos<DeviceType>::file2array()
 {
@@ -620,7 +625,6 @@ void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelB<EFLAG>, const i
   if (eflag_global) ev.evdwl += phi;
   if (eflag_atom) d_eatom[i] += phi;
   }
-
 }
 
 template<class DeviceType>
@@ -17,18 +17,20 @@
 ------------------------------------------------------------------------- */
 
 #include "pair_eam_kokkos.h"
-#include <cmath>
-#include "kokkos.h"
-#include "pair_kokkos.h"
 #include "atom_kokkos.h"
-#include "force.h"
+#include "atom_masks.h"
 #include "comm.h"
-#include "neighbor.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
 #include "neigh_list_kokkos.h"
 #include "neigh_request.h"
-#include "memory_kokkos.h"
-#include "error.h"
-#include "atom_masks.h"
+#include "neighbor.h"
+#include "pair_kokkos.h"
+#include <cmath>
 
 using namespace LAMMPS_NS;
 
@@ -36,7 +36,7 @@ From: centos:7
 # manually install Plumed
 mkdir plumed
 cd plumed
-version=2.6.1
+version=2.7.2
 curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
 tar -xzf plumed.tar.gz
 cd plumed-${version}
@@ -3,7 +3,7 @@ From: centos:8
 
 %post
 dnf -y install epel-release dnf-utils
-dnf config-manager --set-enabled PowerTools
+dnf config-manager --set-enabled powertools
 dnf -y update
 dnf -y install vim-enhanced git file make cmake patch which file ninja-build \
 ccache gcc-c++ gcc-gfortran clang libomp-devel gdb valgrind libubsan libasan libtsan \
@@ -42,7 +42,7 @@ From: centos:8
 # manually install Plumed
 mkdir plumed
 cd plumed
-version=2.6.1
+version=2.7.2
 curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
 tar -xzf plumed.tar.gz
 cd plumed-${version}
tools/singularity/rocky8.def (new file, 110 lines)
@@ -0,0 +1,110 @@
BootStrap: docker
From: rockylinux/rockylinux:8

%post
dnf -y install epel-release dnf-utils
dnf config-manager --set-enabled powertools
dnf -y update
dnf -y install vim-enhanced git file make cmake patch which file ninja-build \
ccache gcc-c++ gcc-gfortran clang libomp-devel gdb valgrind libubsan libasan libtsan \
eigen3-devel openblas-devel libpng-devel libjpeg-devel platform-python-devel \
openmpi-devel mpich-devel fftw-devel voro++-devel gsl-devel hdf5-devel \
netcdf-devel netcdf-cxx-devel netcdf-mpich-devel netcdf-openmpi-devel \
enchant python3-virtualenv doxygen diffutils latexmk readline-devel \
texlive-latex-fonts texlive-pslatex texlive-collection-latexrecommended \
texlive-latex texlive-latexconfig doxygen-latex texlive-collection-latex \
texlive-latex-bin texlive-lualatex-math texlive-fncychap texlive-tabulary \
texlive-framed texlive-wrapfig texlive-upquote texlive-capt-of \
texlive-needspace texlive-titlesec texlive-anysize texlive-dvipng \
blas-devel lapack-devel libyaml-devel openkim-models kim-api-devel \
zstd libzstd-devel
dnf clean all

# we need to reset any module variables
# inherited from the host.
unset __LMOD_REF_COUNT__LMFILES_
unset __LMOD_REF_COUNT_PATH
unset __LMOD_REF_COUNT_LD_LIBRARY_PATH
unset __LMOD_REF_COUNT_MANPATH
unset __LMOD_REF_COUNT_MODULEPATH
unset __LMOD_REF_COUNT_LOADEDMODULES
unset _LMFILES_
unset MODULEPATH
unset MODULESHOME
unset MODULEPATH_ROOT
unset LOADEDMODULES
unset LMOD_SYSTEM_DEFAULT_MODULES

# load MPI by default
. /etc/profile
module load mpi

# manually install Plumed
mkdir plumed
cd plumed
version=2.7.2
curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
tar -xzf plumed.tar.gz
cd plumed-${version}
./configure --disable-doc --prefix=/usr
make
make install
# fix up installation for CentOS and Fedora
mv -v /usr/lib64/pkgconfig/plumed* /usr/share/pkgconfig/
cd ../../
rm -rvf plumed

# create missing readline pkgconfig file
cat > /usr/lib64/pkgconfig/readline.pc <<EOF
prefix=/usr
exec_prefix=/usr
libdir=/usr/lib64
includedir=/usr/include

Name: Readline
Description: GNU Readline library for command line editing
URL: http://tiswww.cwru.edu/php/chet/readline/rltop.html
Version: 7.0
Requires.private: ncurses
Libs: -L\${libdir} -lreadline
Cflags: -I\${includedir}/readline
EOF
# set custom prompt indicating the container name
CUSTOM_PROMPT_ENV=/.singularity.d/env/99-zz_custom_prompt.sh
cat >$CUSTOM_PROMPT_ENV <<EOF
#!/bin/bash
PS1="[centos8:\u@\h] \W> "
EOF
chmod 755 $CUSTOM_PROMPT_ENV


%environment
LC_ALL=C
export LC_ALL

# we need to reset any module variables
# inherited from the host.
unset __LMOD_REF_COUNT__LMFILES_
unset __LMOD_REF_COUNT_PATH
unset __LMOD_REF_COUNT_LD_LIBRARY_PATH
unset __LMOD_REF_COUNT_MANPATH
unset __LMOD_REF_COUNT_MODULEPATH
unset __LMOD_REF_COUNT_LOADEDMODULES
unset _LMFILES_
unset MODULEPATH
unset MODULESHOME
unset MODULEPATH_ROOT
unset LOADEDMODULES
unset LMOD_SYSTEM_DEFAULT_MODULES

# load MPI by default
. /etc/profile
module load mpi
# tell OpenMPI to not try using Infiniband
OMPI_MCA_btl="^openib"
# do not warn about unused components as this messes up testing
OMPI_MCA_btl_base_warn_component_unused="0"
export OMPI_MCA_btl OMPI_MCA_btl_base_warn_component_unused

%labels
Author akohlmey, rbberger
@@ -105,7 +105,7 @@ From: ubuntu:18.04
 # Plumed
 ###########################################################################
 
-export PLUMED_PKG_VERSION=2.6.1
+export PLUMED_PKG_VERSION=2.7.2
 
 mkdir plumed
 cd plumed
@@ -3,7 +3,7 @@ From: ubuntu:18.04
 
 %environment
 export PATH=/usr/lib/ccache:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
 %post
 export DEBIAN_FRONTEND=noninteractive
 apt-get update
@@ -94,7 +94,7 @@ From: ubuntu:18.04
 ###########################################################################
 
 export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-git clone -b rocm-4.1.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
+git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
 mkdir hipCUB/build
 cd hipCUB/build
 CXX=hipcc cmake -D BUILD_TEST=off ..
@@ -129,7 +129,7 @@ From: ubuntu:18.04
 # Plumed
 ###########################################################################
 
-export PLUMED_PKG_VERSION=2.6.1
+export PLUMED_PKG_VERSION=2.7.2
 
 mkdir plumed
 cd plumed
@@ -2,11 +2,11 @@ BootStrap: docker
 From: ubuntu:18.04
 
 %environment
-export PATH=/usr/lib/ccache:/usr/local/cuda-11.0/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-export CUDADIR=/usr/local/cuda-11.0
-export CUDA_PATH=/usr/local/cuda-11.0
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.0/lib64
-export LIBRARY_PATH=/usr/local/cuda-11.0/lib64/stubs
+export PATH=/usr/lib/ccache:/usr/local/cuda-11.4/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
+export CUDADIR=/usr/local/cuda-11.4
+export CUDA_PATH=/usr/local/cuda-11.4
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.4/lib64:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
+export LIBRARY_PATH=/usr/local/cuda-11.4/lib64/stubs
 %post
 export DEBIAN_FRONTEND=noninteractive
 apt-get update
@@ -104,23 +104,19 @@ From: ubuntu:18.04
 add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
 apt-get update
 
-export CUDA_PKG_VERSION=11.0
+export CUDA_PKG_VERSION=11.4
 
 apt-get install -y --no-install-recommends \
-cuda-libraries-$CUDA_PKG_VERSION \
-cuda-command-line-tools-$CUDA_PKG_VERSION \
-cuda-libraries-dev-$CUDA_PKG_VERSION \
-cuda-minimal-build-$CUDA_PKG_VERSION \
+cuda-libraries-${CUDA_PKG_VERSION} \
+cuda-command-line-tools-${CUDA_PKG_VERSION} \
+cuda-libraries-dev-${CUDA_PKG_VERSION} \
+cuda-minimal-build-${CUDA_PKG_VERSION} \
 cuda-compat-$CUDA_PKG_VERSION \
-libcublas-11-0 \
-libcublas-dev-11-0
+libcublas-${CUDA_PKG_VERSION} \
+libcublas-dev-${CUDA_PKG_VERSION}
 
-echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf
-echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
 
 # add missing symlink
-ln -s /usr/local/cuda-11.0 /usr/local/cuda
-ln -s /usr/local/cuda-11.0/lib64/stubs/libcuda.so /usr/local/cuda-11.0/lib64/stubs/libcuda.so.1
+ln -s /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so.1
 
 ###########################################################################
 # NVIDIA OpenCL
@@ -134,7 +130,7 @@ From: ubuntu:18.04
 ###########################################################################
 
 export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-git clone -b rocm-3.7.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
+git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
 mkdir hipCUB/build
 cd hipCUB/build
 CXX=hipcc cmake -D BUILD_TEST=off ..
@@ -169,7 +165,7 @@ From: ubuntu:18.04
 # Plumed
 ###########################################################################
 
-export PLUMED_PKG_VERSION=2.6.1
+export PLUMED_PKG_VERSION=2.7.2
 
 mkdir plumed
 cd plumed
@@ -106,7 +106,7 @@ From: ubuntu:18.04
 # Plumed
 ###########################################################################
 
-export PLUMED_PKG_VERSION=2.6.1
+export PLUMED_PKG_VERSION=2.7.2
 
 mkdir plumed
 cd plumed
@@ -1,5 +1,5 @@
 BootStrap: docker
-From: nvidia/cuda:11.0-devel-ubuntu18.04
+From: nvidia/cuda:11.4.1-devel-ubuntu18.04
 
 %post
 export DEBIAN_FRONTEND=noninteractive
@@ -105,7 +105,7 @@ From: nvidia/cuda:11.0-devel-ubuntu18.04
 # Plumed
 ###########################################################################
 
-export PLUMED_PKG_VERSION=2.6.1
+export PLUMED_PKG_VERSION=2.7.2
 
 mkdir plumed
 cd plumed
@@ -100,7 +100,7 @@ From: ubuntu:20.04
 # Plumed
 ###########################################################################
 
-export PLUMED_PKG_VERSION=2.6.1
+export PLUMED_PKG_VERSION=2.7.2
 
 mkdir plumed
 cd plumed
@@ -3,7 +3,7 @@ From: ubuntu:20.04
 
 %environment
 export PATH=/usr/lib/ccache:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.2.0/llvm/lib
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
 %post
 export DEBIAN_FRONTEND=noninteractive
 apt-get update
@@ -91,7 +91,7 @@ From: ubuntu:20.04
 ###########################################################################
 
 export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-git clone -b rocm-4.1.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
+git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
 mkdir hipCUB/build
 cd hipCUB/build
 CXX=hipcc cmake -D BUILD_TEST=off ..
@@ -126,7 +126,7 @@ From: ubuntu:20.04
 # Plumed
 ###########################################################################
 
-export PLUMED_PKG_VERSION=2.6.1
+export PLUMED_PKG_VERSION=2.7.2
 
 mkdir plumed
 cd plumed
@@ -2,11 +2,11 @@ BootStrap: docker
 From: ubuntu:20.04
 
 %environment
-export PATH=/usr/lib/ccache:/usr/local/cuda-11.0/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-export CUDADIR=/usr/local/cuda-11.0
-export CUDA_PATH=/usr/local/cuda-11.0
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.0/lib64:/opt/rocm/lib:/opt/rocm-4.2.0/llvm/lib
-export LIBRARY_PATH=/usr/local/cuda-11.0/lib64/stubs
+export PATH=/usr/lib/ccache:/usr/local/cuda-11.4/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
+export CUDADIR=/usr/local/cuda-11.4
+export CUDA_PATH=/usr/local/cuda-11.4
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.4/lib64:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
+export LIBRARY_PATH=/usr/local/cuda-11.4/lib64/stubs
 %post
 export DEBIAN_FRONTEND=noninteractive
 apt-get update
@@ -101,23 +101,19 @@ From: ubuntu:20.04
 add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
 apt-get update
 
-export CUDA_PKG_VERSION=11.0
+export CUDA_PKG_VERSION=11.4
 
 apt-get install -y --no-install-recommends \
-cuda-libraries-$CUDA_PKG_VERSION \
-cuda-command-line-tools-$CUDA_PKG_VERSION \
-cuda-libraries-dev-$CUDA_PKG_VERSION \
-cuda-minimal-build-$CUDA_PKG_VERSION \
+cuda-libraries-${CUDA_PKG_VERSION} \
+cuda-command-line-tools-${CUDA_PKG_VERSION} \
+cuda-libraries-dev-${CUDA_PKG_VERSION} \
+cuda-minimal-build-${CUDA_PKG_VERSION} \
 cuda-compat-$CUDA_PKG_VERSION \
-libcublas-11-0 \
-libcublas-dev-11-0
+libcublas-${CUDA_PKG_VERSION} \
+libcublas-dev-${CUDA_PKG_VERSION}
 
-echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf
-echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
 
 # add missing symlink
-ln -s /usr/local/cuda-11.0 /usr/local/cuda
-ln -s /usr/local/cuda-11.0/lib64/stubs/libcuda.so /usr/local/cuda-11.0/lib64/stubs/libcuda.so.1
+ln -s /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so.1
 
 ###########################################################################
 # NVIDIA OpenCL
@@ -131,7 +127,7 @@ From: ubuntu:20.04
 ###########################################################################
 
 export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
-git clone -b rocm-4.2.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
+git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
 mkdir hipCUB/build
 cd hipCUB/build
 CXX=hipcc cmake -D BUILD_TEST=off ..
@@ -166,7 +162,7 @@ From: ubuntu:20.04
 # Plumed
 ###########################################################################
 
-export PLUMED_PKG_VERSION=2.6.1
+export PLUMED_PKG_VERSION=2.7.2
 
 mkdir plumed
 cd plumed
@@ -99,7 +99,7 @@ From: ubuntu:20.04
 # Plumed
 ###########################################################################
 
-export PLUMED_PKG_VERSION=2.6.1
+export PLUMED_PKG_VERSION=2.7.2
 
 mkdir plumed
 cd plumed
@@ -1,5 +1,5 @@
 BootStrap: docker
-From: nvidia/cuda:11.0-devel-ubuntu20.04
+From: nvidia/cuda:11.4.1-devel-ubuntu20.04
 
 %post
 export DEBIAN_FRONTEND=noninteractive
@@ -102,7 +102,7 @@ From: nvidia/cuda:11.0-devel-ubuntu20.04
 # Plumed
 ###########################################################################
 
-export PLUMED_PKG_VERSION=2.6.1
+export PLUMED_PKG_VERSION=2.7.2
 
 mkdir plumed
 cd plumed