Merge branch 'master' into collected-small-changes

This commit is contained in:
Axel Kohlmeyer
2021-09-26 18:12:40 -04:00
36 changed files with 1131 additions and 272 deletions

View File

@ -217,13 +217,20 @@ elseif(GPU_API STREQUAL "OPENCL")
elseif(GPU_API STREQUAL "HIP")
if(NOT DEFINED HIP_PATH)
if(NOT DEFINED ENV{HIP_PATH})
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to HIP installation")
else()
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to HIP installation")
endif()
endif()
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
find_package(HIP REQUIRED)
if(NOT DEFINED ROCM_PATH)
if(NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to ROCm installation")
else()
set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Path to ROCm installation")
endif()
endif()
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH})
find_package(hip REQUIRED)
option(HIP_USE_DEVICE_SORT "Use GPU sorting" ON)
if(NOT DEFINED HIP_PLATFORM)
@ -325,10 +332,11 @@ elseif(GPU_API STREQUAL "HIP")
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h ${LAMMPS_LIB_BINARY_DIR}/gpu/*.cu.cpp")
hip_add_library(gpu STATIC ${GPU_LIB_SOURCES})
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu)
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_HIP)
target_link_libraries(gpu PRIVATE hip::host)
if(HIP_USE_DEVICE_SORT)
# add hipCUB
@ -377,8 +385,9 @@ elseif(GPU_API STREQUAL "HIP")
endif()
endif()
hip_add_executable(hip_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
add_executable(hip_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(hip_get_devices PRIVATE -DUCL_HIP)
target_link_libraries(hip_get_devices hip::host)
if(HIP_PLATFORM STREQUAL "nvcc")
target_compile_definitions(gpu PRIVATE -D__HIP_PLATFORM_NVCC__)

View File

@ -0,0 +1,30 @@
# preset that will enable hip (clang/clang++) with support for MPI and OpenMP (on Linux boxes)
# prefer flang over gfortran, if available
find_program(CLANG_FORTRAN NAMES flang gfortran f95)
set(ENV{OMPI_FC} ${CLANG_FORTRAN})
set(CMAKE_CXX_COMPILER "hipcc" CACHE STRING "" FORCE)
set(CMAKE_C_COMPILER "hipcc" CACHE STRING "" FORCE)
set(CMAKE_Fortran_COMPILER ${CLANG_FORTRAN} CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_DEBUG "-Wall -Wextra -g" CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_DEBUG "-Wall -Wextra -g -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -DNDEBUG -std=f2003" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_DEBUG "-Wall -Wextra -g" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-Wall -Wextra -g -O2 -DNDEBUG" CACHE STRING "" FORCE)
set(CMAKE_C_FLAGS_RELEASE "-O3 -DNDEBUG" CACHE STRING "" FORCE)
set(MPI_CXX "hipcc" CACHE STRING "" FORCE)
set(MPI_CXX_COMPILER "mpicxx" CACHE STRING "" FORCE)
unset(HAVE_OMP_H_INCLUDE CACHE)
set(OpenMP_C "hipcc" CACHE STRING "" FORCE)
set(OpenMP_C_FLAGS "-fopenmp" CACHE STRING "" FORCE)
set(OpenMP_C_LIB_NAMES "omp" CACHE STRING "" FORCE)
set(OpenMP_CXX "hipcc" CACHE STRING "" FORCE)
set(OpenMP_CXX_FLAGS "-fopenmp" CACHE STRING "" FORCE)
set(OpenMP_CXX_LIB_NAMES "omp" CACHE STRING "" FORCE)
set(OpenMP_omp_LIBRARY "libomp.so" CACHE PATH "" FORCE)

View File

@ -11,6 +11,7 @@ of time and requests from the LAMMPS user community.
:maxdepth: 1
Developer_org
Developer_parallel
Developer_flow
Developer_write
Developer_notes

View File

@ -0,0 +1,120 @@
Communication
^^^^^^^^^^^^^
Following the partitioning scheme in use, all per-atom data (atom IDs,
positions, velocities, types, etc.) is distributed across the MPI
processes, which allows LAMMPS to handle very large systems, provided a
correspondingly large number of MPI processes is used. Since the
per-atom data is distributed, MPI processes need access not only to the
data of atoms they "own" but also to information about atoms from
neighboring sub-domains in order to compute the short-range
interactions; in LAMMPS these are referred to as "ghost" atoms. Ghost
atoms are copies of atoms that store the required per-atom data for
atoms up to the communication cutoff distance away. The green
dashed-line boxes in the :ref:`domain-decomposition` figure illustrate
the extended ghost-atom sub-domain for one processor.
This approach is also used to implement periodic boundary
conditions: atoms that lie within the cutoff distance across a periodic
boundary are also stored as ghost atoms and taken from the periodic
replication of the sub-domain, which may be the same sub-domain, e.g. if
running in serial. As a consequence of this, force computation in
LAMMPS is not subject to minimum image conventions and thus cutoffs may
be larger than half the simulation domain.
.. _ghost-atom-comm:
.. figure:: img/ghost-comm.png
:align: center
ghost atom communication
This figure shows the ghost atom communication patterns between
sub-domains for "brick" (left) and "tiled" communication styles for
2d simulations. The numbers indicate MPI process ranks. Here the
sub-domains are drawn spatially separated for clarity. The
dashed-line box is the extended sub-domain of processor 0 which
includes its ghost atoms. The red- and blue-shaded boxes are the
regions of communicated ghost atoms.
Efficient communication patterns are needed to update the "ghost" atom
data, since that needs to be done at every MD time step or minimization
step. The diagrams of the :ref:`ghost-atom-comm` figure illustrate how ghost
atom communication is performed in two stages for a 2d simulation (three
in 3d) for both a regular and irregular partitioning of the simulation
box. For the regular case (left) atoms are exchanged first in the
*x*-direction, then in *y*, with four neighbors in the grid of processor
sub-domains.
In the *x* stage, processor ranks 1 and 2 send owned atoms in their
red-shaded regions to rank 0 (and vice versa). Then in the *y* stage,
ranks 3 and 4 send atoms in their blue-shaded regions to rank 0, which
includes ghost atoms they received in the *x* stage. Rank 0 thus
acquires all its ghost atoms; atoms in the solid blue corner regions
are communicated twice before rank 0 receives them.
For the irregular case (right) the two stages are similar, but a
processor can have more than one neighbor in each direction. In the
*x* stage, MPI ranks 1,2,3 send owned atoms in their red-shaded regions to
rank 0 (and vice versa). These include only atoms between the lower
and upper *y*-boundary of rank 0's sub-domain. In the *y* stage, ranks
4,5,6 send atoms in their blue-shaded regions to rank 0. This may
include ghost atoms they received in the *x* stage, but only if they
are needed by rank 0 to fill its extended ghost atom regions in the
+/-*y* directions (blue rectangles). Thus in this case, ranks 5 and
6 do not include ghost atoms they received from each other (in the *x*
stage) in the atoms they send to rank 0. The key point is that while
the pattern of communication is more complex in the irregular
partitioning case, it can still proceed in two stages (three in 3d)
via atom exchanges with only neighboring processors.
When attributes of owned atoms are sent to neighboring processors to
become attributes of their ghost atoms, LAMMPS calls this a "forward"
communication. On timesteps when atoms migrate to new owning processors
and neighbor lists are rebuilt, each processor creates a list of its
owned atoms which are ghost atoms in each of its neighbor processors.
These lists are used to pack per-atom coordinates (for example) into
message buffers in subsequent steps until the next reneighboring.
A "reverse" communication is when computed ghost atom attributes are
sent back to the processor who owns the atom. This is used (for
example) to sum partial forces on ghost atoms to the complete force on
owned atoms. The order of the two stages described in the
:ref:`ghost-atom-comm` figure is inverted and the same lists of atoms
are used to pack and unpack message buffers with per-atom forces. When
a received buffer is unpacked, the ghost forces are summed to owned atom
forces. As in forward communication, forces on atoms in the four blue
corners of the diagrams are sent, received, and summed twice (once at
each stage) before owning processors have the full force.
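To make these two operations more concrete, below is a minimal sketch
(hypothetical helper functions, not the actual LAMMPS ``Comm`` class
code) of how the same pre-computed list of atom indices is used to pack
coordinates for a forward communication and to sum received forces in a
reverse communication:

.. code-block:: c++

   #include <cstddef>
   #include <vector>

   // forward: pack owned-atom coordinates into a buffer to be sent to a
   // neighboring processor, which unpacks them into its ghost atoms
   void pack_forward(const std::vector<int> &sendlist, const double (*x)[3],
                     std::vector<double> &buf)
   {
     buf.clear();
     for (int i : sendlist) {
       buf.push_back(x[i][0]);
       buf.push_back(x[i][1]);
       buf.push_back(x[i][2]);
     }
   }

   // reverse: unpack ghost-atom forces received from that neighbor and
   // sum them into the owned-atom forces, using the same list
   void unpack_reverse(const std::vector<int> &sendlist,
                       const std::vector<double> &buf, double (*f)[3])
   {
     for (std::size_t n = 0; n < sendlist.size(); ++n) {
       const int i = sendlist[n];
       f[i][0] += buf[3 * n + 0];
       f[i][1] += buf[3 * n + 1];
       f[i][2] += buf[3 * n + 2];
     }
   }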
These two operations are used in many places within LAMMPS aside from
the exchange of coordinates and forces, for example by manybody
potentials to share intermediate per-atom values, or by rigid-body
integrators to enable each atom in a body to access body properties.
Here are additional details about how these communication operations
are performed in LAMMPS:
- When exchanging data with different processors, forward and reverse
communication is done using ``MPI_Send()`` and ``MPI_Irecv()`` calls.
If a processor is "exchanging" atoms with itself, only the pack and
unpack operations are performed, e.g. to create ghost atoms across
periodic boundaries when running on a single processor.
- For forward communication of owned atom coordinates, periodic box
lengths are added and subtracted when the receiving processor is
across a periodic boundary from the sender. There is then no need to
apply a minimum image convention when calculating distances between
atom pairs when building neighbor lists or computing forces.
- The cutoff distance for exchanging ghost atoms is typically equal to
the neighbor cutoff. But it can also be chosen to be longer if needed,
e.g. half the diameter of a rigid body composed of multiple atoms or
over 3x the length of a stretched bond for dihedral interactions. It
can also exceed the periodic box size. For the regular communication
pattern (left), if the cutoff distance extends beyond a neighbor
processor's sub-domain, then multiple exchanges are performed in the
same direction. Each exchange is with the same neighbor processor,
but buffers are packed/unpacked using a different list of atoms. For
forward communication, in the first exchange a processor sends only
owned atoms. In subsequent exchanges, it sends ghost atoms received
in previous exchanges. For the irregular pattern (right) overlaps of
a processor's extended ghost-atom sub-domain with all other processors
in each dimension are detected.

View File

@ -0,0 +1,188 @@
Long-range interactions
^^^^^^^^^^^^^^^^^^^^^^^
For charged systems, LAMMPS can compute long-range Coulombic
interactions via the FFT-based particle-particle/particle-mesh (PPPM)
method implemented in :doc:`kspace style pppm and its variants
<kspace_style>`. For this, Coulombic interactions are partitioned into
short- and long-range components. The short-range portion is computed
in real space as a loop over pairs of charges within a cutoff distance,
using neighbor lists. The long-range portion is computed in reciprocal
space using a kspace style. For the PPPM implementation the simulation
cell is overlaid with a regular FFT grid in 3d. It proceeds in several stages:
a) each atom's point charge is interpolated to nearby FFT grid points,
b) a forward 3d FFT is performed,
c) a convolution operation is performed in reciprocal space,
d) one or more inverse 3d FFTs are performed, and
e) electric field values from grid points near each atom are interpolated to compute
its forces.
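As a simplified illustration of stage (a), the following sketch spreads
a single point charge onto the two nearest points of a periodic 1d grid
using linear (cloud-in-cell) weights; the actual PPPM implementation
uses higher-order stencils in 3d, so this only conveys the idea of
charge interpolation, not the LAMMPS code.

.. code-block:: c++

   #include <cmath>
   #include <vector>

   // Spread point charge q at position x onto a periodic 1d grid with
   // spacing h, using linear (cloud-in-cell) weights.
   void spread_charge_1d(double q, double x, double h, std::vector<double> &rho)
   {
     const int n = (int) rho.size();
     const double s = x / h;              // position in units of the grid spacing
     const int il = (int) std::floor(s);  // grid point to the left of the charge
     const double w = s - il;             // fractional distance from that point
     rho[((il % n) + n) % n] += q * (1.0 - w);
     rho[(((il + 1) % n) + n) % n] += q * w;
   }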
For any of the spatial-decomposition partitioning schemes each processor
owns the brick-shaped portion of FFT grid points contained within its
sub-domain. The two interpolation operations use a stencil of grid
points surrounding each atom. To accommodate the stencil size, each
processor also stores a few layers of ghost grid points surrounding its
brick. Forward and reverse communication of grid point values is
performed similarly to the corresponding :doc:`atom data communication
<Developer_par_comm>`. In this case, electric field values on owned
grid points are sent to neighboring processors to become ghost point
values. Likewise, charge values on ghost points are sent and summed
into the values on owned points.
For triclinic simulation boxes, the FFT grid planes are parallel to
the box faces, but the mapping of charge and electric field values
to/from grid points is done in reduced coordinates where the tilted
box is conceptually a unit cube, so that the stencil and FFT
operations are unchanged. However the FFT grid size required for a
given accuracy is larger for triclinic domains than it is for
orthogonal boxes.
.. _fft-parallel:
.. figure:: img/fft-decomp-parallel.png
:align: center
parallel FFT in PPPM
Stages of a parallel FFT for a simulation domain overlaid
with an 8x8x8 3d FFT grid, partitioned across 64 processors.
Within each of the 4 diagrams, grid cells of the same color are
owned by a single processor; for simplicity only cells owned by 4
or 8 of the 64 processors are colored. The two images on the left
illustrate brick-to-pencil communication. The two images on the
right illustrate pencil-to-pencil communication, which in this
case transposes the *y* and *z* dimensions of the grid.
Parallel 3d FFTs require substantial communication relative to their
computational cost. A 3d FFT is implemented as a series of 1d FFTs
along the *x*-, *y*-, and *z*-directions of the FFT grid. Thus the FFT
grid cannot be decomposed in 3 dimensions like the atoms for parallel
processing, but only in 1 (as planes) or 2 (as pencils) dimensions, and
between these steps the grid needs to be transposed so that the portion
of the FFT grid "owned" by each MPI process is complete in the
direction of the 1d FFTs it has to perform. LAMMPS uses the
pencil-decomposition algorithm shown in the :ref:`fft-parallel` figure.
Initially (far left), each processor owns a brick of same-color grid
cells (actually grid points) contained within its sub-domain. A
brick-to-pencil communication operation converts this layout to 1d
pencils in the *x*-dimension (center left). Again, cells of the same
color are owned by the same processor. Each processor can then compute
a 1d FFT on each pencil of data it wholly owns using a call to the
configured FFT library. A pencil-to-pencil communication then converts
this layout to pencils in the *y* dimension (center right) which
effectively transposes the *x* and *y* dimensions of the grid, followed
by 1d FFTs in *y*. A final transpose of pencils from *y* to *z* (far
right) followed by 1d FFTs in *z* completes the forward FFT. The data
is left in a *z*-pencil layout for the convolution operation. One or
more inverse FFTs then perform the sequence of 1d FFTs and communication
steps in reverse order; the final layout of resulting grid values is the
same as the initial brick layout.
Each communication operation within the FFT (brick-to-pencil or
pencil-to-pencil or pencil-to-brick) converts one tiling of the 3d grid
to another, where a tiling in this context means an assignment of a
small brick-shaped subset of grid points to each processor, the union of
which comprise the entire grid. The parallel `fftMPI library
<https://lammps.github.io/fftmpi/>`_ written for LAMMPS allows arbitrary
definitions of the tiling so that an irregular partitioning of the
simulation domain can use it directly. Transforming data from one
tiling to another is implemented in `fftMPI` using point-to-point
communication, where each processor sends data to a few other
processors, since each tile in the initial tiling overlaps with a
handful of tiles in the final tiling.
The transformations could also be done using collective communication
across all *P* processors with a single call to ``MPI_Alltoall()``, but
this is typically much slower. However, for the specialized brick and
pencil tilings illustrated in the :ref:`fft-parallel` figure, collective
communication across the entire MPI communicator is not required. In
the example an :math:`8^3` grid with 512 grid cells is partitioned
across 64 processors; each processor owns a 2x2x2 3d brick of grid
cells. The initial brick-to-pencil communication (upper left to upper
right) only requires collective communication within subgroups of 4
processors, as illustrated by the 4 colors. More generally, a
brick-to-pencil communication can be performed by partitioning *P*
processors into :math:`P^{\frac{2}{3}}` subgroups of
:math:`P^{\frac{1}{3}}` processors each. Each subgroup performs
collective communication only within its subgroup. Similarly,
pencil-to-pencil communication can be performed by partitioning *P*
processors into :math:`P^{\frac{1}{2}}` subgroups of
:math:`P^{\frac{1}{2}}` processors each. This is illustrated in the
figure for the :math:`y \Rightarrow z` communication (center). An
eight-processor subgroup owns the front *yz* plane of data and performs
collective communication within the subgroup to transpose from a
*y*-pencil to *z*-pencil layout.
LAMMPS uses point-to-point communication by default, but also provides
partitioned collective communication as an option, which is selected
with the :doc:`kspace_modify collective yes <kspace_modify>` command.
In the latter case, the code detects the size of the disjoint subgroups
and partitions the single *P*-size communicator into multiple smaller
communicators, each of which invokes collective communication. Testing
on a large IBM Blue Gene/Q machine at Argonne National Laboratory
showed a significant improvement in FFT performance for large processor
counts; partitioned collective communication was faster than
point-to-point communication or global collective communication
involving all *P* processors.
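A minimal sketch of this communicator splitting, assuming the subgroup
membership of each rank has already been reduced to an integer "color"
(how LAMMPS derives it from the brick and pencil tilings is not shown):
``MPI_Comm_split()`` yields one sub-communicator per disjoint subgroup,
within which the transpose can then use collective calls such as
``MPI_Alltoallv()``.

.. code-block:: c++

   #include <mpi.h>

   // Split the parent communicator into disjoint sub-communicators so
   // that the brick-to-pencil or pencil-to-pencil data exchange can use
   // collective calls restricted to one subgroup.
   MPI_Comm make_subgroup_comm(MPI_Comm world, int color)
   {
     int me;
     MPI_Comm_rank(world, &me);
     MPI_Comm subcomm;
     // ranks with equal color form one sub-communicator, ordered by
     // their rank in the parent communicator
     MPI_Comm_split(world, color, me, &subcomm);
     return subcomm;
   }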
Here are some additional details about FFTs for long-range and related
grid/particle operations that LAMMPS supports:
- The fftMPI library allows each grid dimension to be a multiple of
small prime factors (2,3,5), and allows any number of processors to
perform the FFT. The resulting brick and pencil decompositions are
thus not always as well-aligned, but the size of the subgroups of
processors for the two modes of communication (brick/pencil and
pencil/pencil) still scales as :math:`O(P^{\frac{1}{3}})` and
:math:`O(P^{\frac{1}{2}})`.
- For efficiency in performing 1d FFTs, the grid transpose
operations illustrated in the :ref:`fft-parallel` figure also involve
reordering the 3d data so that a different dimension is contiguous
in memory. This reordering can be done during the packing or
unpacking of buffers for MPI communication.
- For large systems and particularly a large number of MPI processes,
the dominant cost for parallel FFTs is often the communication, not
the computation of 1d FFTs, even though the latter scales as :math:`N
\log(N)` in the number of grid points *N* per grid direction. This is
due to the fact that only a 2d decomposition into pencils is possible
while atom data (and their corresponding short-range force and energy
computations) can be decomposed efficiently in 3d.
This can be addressed by reducing the number of MPI processes involved
in the communication through :doc:`hybrid MPI + OpenMP
parallelization <Speed_omp>`. OpenMP parallelization is then used
inside the MPI domains; while that may have a lower parallel
efficiency, it reduces the communication overhead.
As an alternative it is also possible to start a :ref:`multi-partition
<partition>` calculation and then use the :doc:`verlet/split
integrator <run_style>` to perform the PPPM computation on a
dedicated, separate partition of MPI processes. This uses an integer
"1:*p*" mapping of *p* sub-domains of the atom decomposition to one
sub-domain of the FFT grid decomposition, where pairwise non-bonded
and bonded forces and energies are computed on the larger partition
while the PPPM kspace computation runs concurrently on the smaller partition.
- LAMMPS also implements PPPM-based solvers for other long-range
interactions, dipole and dispersion (Lennard-Jones), which can be used
in conjunction with long-range Coulombics for point charges.
- LAMMPS implements a ``GridComm`` class which overlays the simulation
domain with a regular grid, partitions it across processors in a
manner consistent with processor sub-domains, and provides methods for
forward and reverse communication of owned and ghost grid point
values. It is used for PPPM as an FFT grid (as outlined above) and
also for the MSM algorithm which uses a cascade of grid sizes from
fine to coarse to compute long-range Coulombic forces. The GridComm
class is also useful for models where continuum fields interact with
particles. For example, the two-temperature model (TTM) defines heat
transfer between atoms (particles) and electrons (continuum gas) where
spatial variations in the electron temperature are computed by finite
differences of a discretized heat equation on a regular grid. The
:doc:`fix ttm/grid <fix_ttm>` command uses the ``GridComm`` class
internally to perform its grid operations on a distributed grid
instead of the original :doc:`fix ttm <fix_ttm>` which uses a
replicated grid.

View File

@ -0,0 +1,159 @@
Neighbor lists
^^^^^^^^^^^^^^
To compute forces efficiently, each processor creates a Verlet-style
neighbor list which enumerates all pairs of atoms *i,j* (*i* = owned,
*j* = owned or ghost) with separation less than the applicable
neighbor list cutoff distance. In LAMMPS the neighbor lists are stored
in a multiple-page data structure; each page is a contiguous chunk of
memory which stores vectors of neighbor atoms *j* for many *i* atoms.
This allows pages to be incrementally allocated or deallocated in blocks
as needed. Neighbor lists typically consume the most memory of any data
structure in LAMMPS. The neighbor list is rebuilt (from scratch) once
every few timesteps, then used repeatedly each step for force or other
computations. The neighbor cutoff distance is :math:`R_n = R_f +
\Delta_s`, where :math:`R_f` is the (largest) force cutoff defined by
the interatomic potential for computing short-range pairwise or manybody
forces and :math:`\Delta_s` is a "skin" distance that allows the list to
be used for multiple steps assuming that atoms do not move very far
between consecutive time steps. Typically the code triggers
reneighboring when any atom has moved half the skin distance since the
last reneighboring; this and other options of the neighbor list rebuild
can be adjusted with the :doc:`neigh_modify <neigh_modify>` command.
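A minimal sketch of that default trigger criterion (illustrative only;
the actual check lives in the ``Neighbor`` class and honors the
:doc:`neigh_modify <neigh_modify>` settings): each atom's displacement
since the last rebuild is compared against half the skin distance.

.. code-block:: c++

   // Check whether any owned atom has moved more than half the skin
   // distance since its position was recorded at the last neighbor list
   // build.  In a parallel run the result still has to be combined
   // across all MPI ranks (e.g. with an MPI_Allreduce).
   bool need_reneighbor(int nlocal, const double (*x)[3],
                        const double (*xhold)[3], double skin)
   {
     const double triggersq = 0.25 * skin * skin;    // (skin/2)^2
     for (int i = 0; i < nlocal; ++i) {
       const double dx = x[i][0] - xhold[i][0];
       const double dy = x[i][1] - xhold[i][1];
       const double dz = x[i][2] - xhold[i][2];
       if (dx * dx + dy * dy + dz * dz > triggersq) return true;
     }
     return false;
   }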
On steps when reneighboring is performed, atoms which have moved outside
their owning processor's sub-domain are first migrated to new processors
via communication. Periodic boundary conditions are also (only)
enforced on these steps to ensure each atom is re-assigned to the
correct processor. After migration, the atoms owned by each processor
are stored in a contiguous vector. Periodically each processor
spatially sorts owned atoms within its vector to reorder it for improved
cache efficiency in force computations and neighbor list building. For
this, atoms are spatially binned and then reordered so that atoms in the
same bin are adjacent in the vector. Atom sorting can be disabled or
its settings modified with the :doc:`atom_modify <atom_modify>` command.
.. _neighbor-stencil:
.. figure:: img/neigh-stencil.png
:align: center
neighbor list stencils
A 2d simulation sub-domain (thick black line) and the corresponding
ghost atom cutoff region (dashed blue line) for both orthogonal
(left) and triclinic (right) domains. A regular grid of neighbor
bins (thin lines) overlays the entire simulation domain and need not
align with sub-domain boundaries; only the portion overlapping the
augmented sub-domain is shown. In the triclinic case it overlaps the
bounding box of the tilted rectangle. The blue- and red-shaded bins
represent a stencil of bins searched to find neighbors of a particular
atom (black dot).
To build a local neighbor list in linear time, the simulation domain is
overlaid (conceptually) with a regular 3d (or 2d) grid of neighbor bins,
as shown in the :ref:`neighbor-stencil` figure for 2d models and a
single MPI processor's sub-domain. Each processor stores a set of
neighbor bins which overlap its sub-domain extended by the neighbor
cutoff distance :math:`R_n`. As illustrated, the bins need not align
with processor boundaries; an integer number of bins in each dimension
is fit to the size of the entire simulation box.
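A minimal sketch of that fitting step for one dimension (illustrative
only; LAMMPS handles additional corner cases and the extended ghost
region):

.. code-block:: c++

   // Fit an integer number of neighbor bins into one dimension of the
   // simulation box for a requested bin size; the resulting actual bin
   // size is at least as large as the requested one.
   int bins_in_dimension(double boxlen, double requested_binsize,
                         double &actual_binsize)
   {
     int nbins = (int) (boxlen / requested_binsize);
     if (nbins < 1) nbins = 1;          // always use at least one bin
     actual_binsize = boxlen / nbins;
     return nbins;
   }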
Most often LAMMPS builds what it calls a "half" neighbor list where
each *i,j* neighbor pair is stored only once, with either atom *i* or
*j* as the central atom. The build can be done efficiently by using a
pre-computed "stencil" of bins around a central origin bin which
contains the atom whose neighbors are being searched for. A stencil
is simply a list of integer offsets in *x,y,z* of nearby bins
surrounding the origin bin which are close enough to contain any
neighbor atom *j* within a distance :math:`R_n` from any atom *i* in the
origin bin. Note that for a half neighbor list, the stencil can be
asymmetric since each atom only needs to store half of its nearby
neighbors. These stencils are illustrated in the figure for a half list
and a bin size of :math:`\frac{1}{2} R_n`. There are 13 red+blue
stencil bins in 2d (for the orthogonal case, 15 for triclinic). In 3d
there would be 63: 13 in the plane of bins that contains the origin bin
and 25 in each of the two planes above it in the *z* direction (75 for
triclinic). The
reason the triclinic stencil has extra bins is because the bins tile the
bounding box of the entire triclinic domain and thus are not periodic
with respect to the simulation box itself. The stencil and logic for
determining which *i,j* pairs to include in the neighbor list are
altered slightly to account for this.
To build a neighbor list, a processor first loops over its "owned" plus
"ghost" atoms and assigns each to a neighbor bin. This uses an integer
vector to create a linked list of atom indices within each bin. It then
performs a triply-nested loop over its owned atoms *i*, the stencil of
bins surrounding atom *i*'s bin, and the *j* atoms in each stencil bin
(including ghost atoms). If the distance :math:`r_{ij} < R_n`, then
atom *j* is added to the vector of atom *i*'s neighbors.
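Below is a condensed sketch of that binning and triple loop. It
assumes the bins already cover the extended (ghost) region,
``binhead``/``next`` form the per-bin linked lists described above,
``atom2bin`` records each owned atom's bin, and ``stencil`` holds
pre-computed 1d bin offsets; for simplicity a full, symmetric stencil
with a ``j > i`` test replaces the asymmetric half stencil, and a
vector of vectors stands in for the paged neighbor storage LAMMPS
actually uses.

.. code-block:: c++

   #include <vector>

   // Build a half neighbor list from pre-binned atoms: binhead[b] is
   // the first atom in bin b (-1 if empty) and next[i] chains further
   // atoms in the same bin; stencil holds offsets of nearby bins
   // relative to an atom's own bin.
   void build_neighbor_list(int nlocal, const double (*x)[3], double cutsq,
                            const std::vector<int> &binhead,
                            const std::vector<int> &next,
                            const std::vector<int> &atom2bin,
                            const std::vector<int> &stencil,
                            std::vector<std::vector<int>> &neigh)
   {
     neigh.assign(nlocal, {});
     for (int i = 0; i < nlocal; ++i) {       // owned atoms only
       for (int offset : stencil) {           // bins around atom i's bin
         for (int j = binhead[atom2bin[i] + offset]; j >= 0; j = next[j]) {
           if (j <= i) continue;              // store each pair only once
           const double dx = x[i][0] - x[j][0];
           const double dy = x[i][1] - x[j][1];
           const double dz = x[i][2] - x[j][2];
           if (dx * dx + dy * dy + dz * dz < cutsq) neigh[i].push_back(j);
         }
       }
     }
   }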
Here are additional details about neighbor list build options LAMMPS
supports:
- The choice of bin size is an option; a size half of :math:`R_n` has
been found to be optimal for many typical cases. Smaller bins incur
additional overhead to loop over; larger bins require more distance
calculations. Note that for smaller bin sizes, the 2d stencil in the
figure would be more semi-circular in shape (hemispherical in 3d),
with bins near the corners of the square eliminated due to their
distance from the origin bin.
- Depending on the interatomic potential(s) and other commands used in
an input script, multiple neighbor lists and stencils with different
attributes may be needed. This includes lists with different cutoff
distances, e.g. for force computation versus occasional diagnostic
computations such as a radial distribution function, or for the
r-RESPA time integrator which can partition pairwise forces by
distance into subsets computed at different time intervals. It
includes "full" lists (as opposed to half lists) where each *i,j* pair
appears twice, stored once with *i* and *j*, and which use a larger
symmetric stencil. It also includes lists with partial enumeration of
ghost atom neighbors. The full and ghost-atom lists are used by
various manybody interatomic potentials. Lists may also use different
criteria for inclusion of a pair interaction. Typically this simply
depends only on the distance between two atoms and the cutoff
distance. But for finite-size coarse-grained particles with
individual diameters (e.g. polydisperse granular particles), it can
also depend on the diameters of the two particles.
- When using :doc:`pair style hybrid <pair_hybrid>` multiple sub-lists
of the master neighbor list for the full system need to be generated,
one for each sub-style, which contains only the *i,j* pairs needed to
compute interactions between subsets of atoms for the corresponding
potential. This means not all *i* or *j* atoms owned by a processor
are included in a particular sub-list.
- Some models use different cutoff lengths for pairwise interactions
between different kinds of particles which are stored in a single
neighbor list. One example is a solvated colloidal system with large
colloidal particles where colloid/colloid, colloid/solvent, and
solvent/solvent interaction cutoffs can be dramatically different.
Another is a model of polydisperse finite-size granular particles;
pairs of particles interact only when they are in contact with each
other. Mixtures with particle size ratios as high as 10-100x may be
used to model realistic systems. Efficient neighbor list building
algorithms for these kinds of systems are available in LAMMPS. They
include a method which uses different stencils for different cutoff
lengths and trims the stencil to only include bins that straddle the
cutoff sphere surface. More recently a method which uses both
multiple stencils and multiple bin sizes was developed; it builds
neighbor lists efficiently for systems with particles of any size
ratio, though other considerations (timestep size, force computations)
may limit the ability to model systems with huge polydispersity.
- For small and sparse systems and as a fallback method, LAMMPS also
supports neighbor list construction without binning by using a full
:math:`O(N^2)` loop over all *i,j* atom pairs in a sub-domain when
using the :doc:`neighbor nsq <neighbor>` command.
- Depending on the "pair" setting of the :doc:`newton <newton>` command,
the "half" neighbor lists may contain **all** pairs of atoms where
atom *j* is a ghost atom (i.e. when the newton pair setting is *off*).
For the newton pair *on* setting, a ghost atom *j* is only added to the
list if its *z* coordinate is larger than that of atom *i*, or, if that
is equal, its *y* coordinate is larger, or, if that is also equal, its
*x* coordinate is larger (see the sketch below). For homogeneously
dense systems this picks neighbors from a same-sized sector in the same
direction relative to the "owned" atom, which should lead to neighbor
lists of similar length and thus reduce the chance of a load imbalance.
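A minimal sketch of that ordering criterion (illustrative only, not
the actual LAMMPS neighbor-build code): it decides whether a pair with
a ghost atom *j* is kept in the half list of owned atom *i* when the
newton pair setting is *on*.

.. code-block:: c++

   // Keep the pair (i,j) with ghost atom j only if j lies "above" i in
   // z, with ties broken first by the y and then by the x coordinate.
   bool keep_ghost_pair(const double *xi, const double *xj)
   {
     if (xj[2] != xi[2]) return xj[2] > xi[2];
     if (xj[1] != xi[1]) return xj[1] > xi[1];
     return xj[0] > xi[0];
   }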

View File

@ -0,0 +1,114 @@
OpenMP Parallelism
^^^^^^^^^^^^^^^^^^
The styles in the INTEL, KOKKOS, and OPENMP packages use OpenMP thread
parallelism, predominantly to distribute loops over local data, and
thus follow a parallelization strategy orthogonal to the decomposition
into spatial domains used by the :doc:`MPI partitioning
<Developer_par_part>`. For clarity, this section discusses only the
implementation in the OPENMP package, as it is the simplest. The INTEL
and KOKKOS packages offer additional options and are more complex,
since they support more features and different hardware such as
co-processors or GPUs.
One of the key decisions when implementing the OPENMP package was to
keep the changes to the source code small, so that it would be easier to
maintain the code and keep it in sync with the non-threaded standard
implementation. This is achieved by a) making the OPENMP version a
derived class from the regular version (e.g. ``PairLJCutOMP`` from
``PairLJCut``) and overriding only methods that are multi-threaded or
need to be modified to support multi-threading (similar to what was done
in the OPT package), b) keeping the structure in the modified code very
similar so that side-by-side comparisons are still useful, and c)
offloading additional functionality and multi-thread support functions
into three separate classes ``ThrOMP``, ``ThrData``, and ``FixOMP``.
``ThrOMP`` provides additional, multi-thread aware functionality not
available in the corresponding base class (e.g. ``Pair`` for
``PairLJCutOMP``) like multi-thread aware variants of the "tally"
functions. Those functions are made available through multiple
inheritance so those new functions have to have unique names to avoid
ambiguities; typically ``_thr`` is appended to the name of the function.
``ThrData`` is a class that manages per-thread data structures.
It is used instead of extending the corresponding storage to per-thread
arrays to avoid slowdowns due to "false sharing" when multiple threads
update adjacent elements in an array and thus force the CPU cache lines
to be reset and re-fetched. ``FixOMP`` manages the "multi-thread
state", such as settings and access to per-thread storage; it is
activated by the :doc:`package omp <package>` command.
Avoiding data races
"""""""""""""""""""
A key problem when implementing thread parallelism in an MD code is
to avoid data races when updating accumulated properties like forces,
energies, and stresses. When interactions are computed, they always
involve multiple atoms and thus there are race conditions when multiple
threads want to update per-atom data of the same atoms. Five possible
strategies have been considered to avoid this:
1) restructure the code so that there is no overlapping access possible
when computing in parallel, e.g. by breaking lists into multiple
parts and synchronizing threads in between.
2) have each thread be "responsible" for a specific group of atoms and
compute these interactions multiple times, once on each thread that
is responsible for a given atom and then have each thread only update
the properties of this atom.
3) use mutexes around functions and regions of code where the data race
could happen.
4) use atomic operations when updating per-atom properties.
5) use replicated per-thread data structures to accumulate data without
conflicts and then use a reduction to combine those results into the
data structures used by the regular style.
Option 5 was chosen for the OPENMP package because it would retain the
performance for the case of 1 thread and the code would be more
maintainable. Option 1 would require extensive code changes,
particularly to the neighbor list code; option 2 would have incurred a
2x or more performance penalty for the serial case; option 3 causes
significant overhead and would enforce serialization of operations in
inner loops and thus defeat the purpose of multi-threading; option 4
slows down the serial case, although not as badly as option 2. The
downside of option 5 is that the overhead of the reduction operations
grows with the number of threads used, so there is a crossover point
beyond which options 2 or 4 would result in faster execution. That is
why option 2, for example, is used in the GPU package, since a GPU is a
processor with a massive number of threads. However, since the MPI
parallelization is generally more effective for typical MD systems, the
expectation is that thread parallelism is only used for a smaller number
of threads (2-8). At the time of its implementation, that number was
equivalent to the number of CPU cores per CPU socket on high-end
supercomputers.
Thus, when OpenMP support is enabled, arrays like the force array are
dimensioned to the number of atoms times the number of threads, and
inside the compute functions each thread obtains a pointer to a
different chunk. Similarly, accumulators like the potential energy or
virial are kept in per-thread instances of the ``ThrData`` class and
only reduced and stored in their global counterparts at the end of the
force computation.
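A minimal sketch of that reduction, assuming the force array has been
over-allocated as ``nthreads`` consecutive chunks of ``nall`` atoms
each (the names are illustrative, not the actual OPENMP package API):
each thread accumulates into its own chunk during the force loop, and
the chunks are afterwards summed back into the first one, with the sum
itself distributed statically over the threads.

.. code-block:: c++

   #include <omp.h>

   // f holds nthreads consecutive chunks of nall x 3 force values;
   // chunk 0 is the "real" force array, chunks 1..nthreads-1 are the
   // per-thread copies filled during the force computation.
   void reduce_thread_forces(double (*f)[3], int nall, int nthreads)
   {
   #pragma omp parallel for schedule(static)
     for (int i = 0; i < nall; ++i) {
       for (int t = 1; t < nthreads; ++t) {
         f[i][0] += f[t * nall + i][0];
         f[i][1] += f[t * nall + i][1];
         f[i][2] += f[t * nall + i][2];
       }
     }
   }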
Loop scheduling
"""""""""""""""
Multi-thread parallelization is applied by distributing (outer) loops
statically across threads. Typically this would be the loop over local
atoms *i* when processing *i,j* pairs of atoms from a neighbor list.
The design of the neighbor list code results in atoms having a similar
number of neighbors for homogeneous systems, so load imbalances across
threads are uncommon; they typically occur for systems where the MPI
parallelization would also be unbalanced, which usually has a more
pronounced impact on the performance. The same loop scheduling scheme
can also be applied to the reduction operations on per-atom data to
reduce the overhead of the reduction.
Neighbor list parallelization
"""""""""""""""""""""""""""""
In addition to the parallelization of the force computations, the
generation of the neighbor lists is also parallelized. As explained
previously, neighbor lists are built by looping over "owned" atoms and
storing the neighbors in "pages". In the OPENMP variants of the
neighbor list code, each thread operates on a different chunk of "owned"
atoms and allocates and fills its own set of pages with neighbor list
data. This is achieved by each thread keeping its own instance of the
:cpp:class:`MyPage <LAMMPS_NS::MyPage>` page allocator class.

View File

@ -0,0 +1,89 @@
Partitioning
^^^^^^^^^^^^
The underlying spatial decomposition strategy used by LAMMPS for
distributed-memory parallelism is set with the :doc:`comm_style command
<comm_style>` and can be either "brick" (a regular grid) or "tiled".
.. _domain-decomposition:
.. figure:: img/domain-decomp.png
:align: center
domain decomposition
This figure shows the different kinds of domain decomposition used
for MPI parallelization: "brick" with an orthogonal (left) and a
triclinic (middle) simulation domain, and a "tiled" decomposition
(right). The black lines show the division into
sub-domains and the contained atoms are "owned" by the corresponding
MPI process. The green dashed lines indicate how sub-domains are
extended with "ghost" atoms up to the communication cutoff distance.
The LAMMPS simulation box is a 3d or 2d volume, which can be orthogonal
or triclinic in shape, as illustrated in the :ref:`domain-decomposition`
figure for the 2d case. Orthogonal means the box edges are aligned with
the *x*, *y*, *z* Cartesian axes, and the box faces are thus all
rectangular. Triclinic allows for a more general parallelepiped shape
in which edges are aligned with three arbitrary vectors and the box
faces are parallelograms. In each dimension box faces can be periodic,
or non-periodic with fixed or shrink-wrapped boundaries. In the fixed
case, atoms which move outside the face are deleted; shrink-wrapped
means the position of the box face adjusts continuously to enclose all
the atoms.
For distributed-memory MPI parallelism, the simulation box is spatially
decomposed (partitioned) into non-overlapping sub-domains which fill the
box. The default partitioning, "brick", is most suitable when atom
density is roughly uniform, as shown in the left-side images of the
:ref:`domain-decomposition` figure. The sub-domains comprise a regular
grid and all sub-domains are identical in size and shape. Both the
orthogonal and triclinic boxes can deform continuously during a
simulation, e.g. to compress a solid or shear a liquid, in which case
the processor sub-domains likewise deform.
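To make the regular "brick" case concrete, here is a minimal sketch
(orthogonal box, equally sized sub-domains; not the actual LAMMPS
``Comm`` code, and the rank ordering is only illustrative) of how a
position maps to the sub-domain, and hence the MPI process, that owns
it:

.. code-block:: c++

   // Map a position inside an orthogonal box onto a regular px x py x pz
   // grid of equally sized sub-domains and return the index of the
   // owning sub-domain.
   int owning_subdomain(const double *x, const double *boxlo,
                        const double *boxhi, int px, int py, int pz)
   {
     auto cell = [](double v, double lo, double hi, int n) {
       int i = (int) (n * (v - lo) / (hi - lo));
       if (i < 0) i = 0;          // clamp positions exactly on a box face
       if (i >= n) i = n - 1;
       return i;
     };
     const int ix = cell(x[0], boxlo[0], boxhi[0], px);
     const int iy = cell(x[1], boxlo[1], boxhi[1], py);
     const int iz = cell(x[2], boxlo[2], boxhi[2], pz);
     return (iz * py + iy) * px + ix;
   }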
For models with non-uniform density, the number of particles per
processor can become imbalanced with the default partitioning. This
reduces parallel efficiency, as the overall simulation rate is limited
by the slowest processor, i.e. the one with the largest computational
load. For such models, LAMMPS supports multiple strategies to reduce
the load imbalance:
- The processor grid decomposition is by default based on the simulation
cell volume and tries to optimize the volume-to-surface ratio of the
sub-domains. This can be changed with the :doc:`processors command <processors>`.
- The parallel planes defining the size of the sub-domains can be shifted
with the :doc:`balance command <balance>`, which can be done in addition
to choosing a more optimal processor grid.
- The recursive bisectioning algorithm in combination with the "tiled"
communication style can produce a partitioning with equal numbers of
particles in each sub-domain.
.. |decomp1| image:: img/decomp-regular.png
:width: 24%
.. |decomp2| image:: img/decomp-processors.png
:width: 24%
.. |decomp3| image:: img/decomp-balance.png
:width: 24%
.. |decomp4| image:: img/decomp-rcb.png
:width: 24%
|decomp1| |decomp2| |decomp3| |decomp4|
The pictures above demonstrate different decompositions for a 2d system
with 12 MPI ranks. The atom colors indicate the load imbalance of each
sub-domain, with green being optimal and red the least optimal.
Due to the vacuum in the system, the default decomposition is unbalanced,
with several MPI ranks having no atoms at all (left). By forcing a 1x12x1
processor grid, every MPI rank now performs computations, but the number
of atoms per sub-domain is still uneven and the thin slice shape increases
the amount of communication between sub-domains (center left). With a
2x6x1 processor grid and shifted sub-domain divisions, the load
imbalance is further reduced and less communication is required
between sub-domains (center right). Using recursive bisectioning leads
to a further improved decomposition (right).

View File

@ -0,0 +1,28 @@
Parallel algorithms
-------------------
LAMMPS is designed to enable running simulations in parallel using the
MPI parallel communication standard with distributed data via domain
decomposition. The parallelization aims to be efficient and to result
in good strong scaling (= good speedup for the same system) and good
weak scaling (= the computational cost of enlarging the system is
proportional to the system size). Additional parallelization using GPUs
or OpenMP can also be applied within the sub-domain assigned to an MPI
process. For clarity, most of the following illustrations show the 2d
simulation case. The underlying algorithms, however, apply equally well
to both the 2d and 3d cases.
.. note::
The text and most of the figures in this chapter were adapted
for the manual from the section on parallel algorithms in the
:ref:`new LAMMPS paper <lammps_paper>`.
.. toctree::
:maxdepth: 1
Developer_par_part
Developer_par_comm
Developer_par_neigh
Developer_par_long
Developer_par_openmp

View File

@ -4,28 +4,40 @@ Citing LAMMPS
Core Algorithms
^^^^^^^^^^^^^^^
Since LAMMPS is a community project, there is not a single one
publication or reference that describes **all** of LAMMPS.
The canonical publication that describes the foundation, that is
the basic spatial decomposition approach, the neighbor finding,
and basic communications algorithms used in LAMMPS is:
The paper mentioned below is the best overview of LAMMPS, but there are
also publications describing particular models or algorithms implemented
in LAMMPS or complementary software that it has interfaces to. Please
see below for how to cite contributions to LAMMPS.
`S. Plimpton, Fast Parallel Algorithms for Short-Range Molecular Dynamics, J Comp Phys, 117, 1-19 (1995). <http://www.sandia.gov/~sjplimp/papers/jcompphys95.pdf>`_
.. _lammps_paper:
So any project using LAMMPS (or a derivative application using LAMMPS as
a simulation engine) should cite this paper. A new publication
describing the developments and improvements of LAMMPS in the 25 years
since then is currently in preparation.
The latest canonical publication that describes the basic features, the
source code design, the program structure, the spatial decomposition
approach, the neighbor finding, basic communications algorithms, and how
users and developers have contributed to LAMMPS is:
`LAMMPS - A flexible simulation tool for particle-based materials modeling at the atomic, meso, and continuum scales, Comp. Phys. Comm. (accepted 09/2021), DOI:10.1016/j.cpc.2021.108171 <https://doi.org/10.1016/j.cpc.2021.108171>`_
So a project using LAMMPS or a derivative application that uses LAMMPS
as a simulation engine should cite this paper. The paper is expected to
be published in its final form under the same DOI in the first half
of 2022.
The original publication describing the parallel algorithms used in the
initial versions of LAMMPS is:
`S. Plimpton, Fast Parallel Algorithms for Short-Range Molecular Dynamics, J Comp Phys, 117, 1-19 (1995). <http://www.sandia.gov/~sjplimp/papers/jcompphys95.pdf>`_
DOI for the LAMMPS code
^^^^^^^^^^^^^^^^^^^^^^^
LAMMPS developers use the `Zenodo service at CERN
<https://zenodo.org/>`_ to create digital object identifies (DOI) for
stable releases of the LAMMPS code. There are two types of DOIs for the
LAMMPS source code: the canonical DOI for **all** versions of LAMMPS,
which will always point to the **latest** stable release version is:
LAMMPS developers use the `Zenodo service at CERN <https://zenodo.org/>`_
to create digital object identifiers (DOIs) for stable releases of the
LAMMPS source code. There are two types of DOIs for the LAMMPS source code.
The canonical DOI for **all** versions of LAMMPS, which will always
point to the **latest** stable release version, is:
- DOI: `10.5281/zenodo.3726416 <https://dx.doi.org/10.5281/zenodo.3726416>`_
@ -45,11 +57,13 @@ about LAMMPS and its features.
Citing contributions
^^^^^^^^^^^^^^^^^^^^
LAMMPS has many features and that use either previously published
methods and algorithms or novel features. It also includes potential
parameter filed for specific models. Where available, a reminder about
references for optional features used in a specific run is printed to
the screen and log file. Style and output location can be selected with
the :ref:`-cite command-line switch <cite>`. Additional references are
LAMMPS has many features that use either previously published methods
and algorithms or novel features. It also includes potential parameter
files for specific models. Where available, a reminder about references
for optional features used in a specific run is printed to the screen
and log file. Style and output location can be selected with the
:ref:`-cite command-line switch <cite>`. Additional references are
given in the documentation of the :doc:`corresponding commands
<Commands_all>` or in the :doc:`Howto tutorials <Howto>`.
<Commands_all>` or in the :doc:`Howto tutorials <Howto>`. So please
make certain that you provide the proper acknowledgments and citations
in any published works using LAMMPS.

Binary file not shown.

After

Width:  |  Height:  |  Size: 129 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 121 KiB

BIN
doc/src/img/decomp-rcb.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 121 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 121 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 547 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 35 KiB

BIN
doc/src/img/ghost-comm.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 53 KiB

View File

@ -1,4 +1,3 @@
// clang-format off
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
https://www.lammps.org/, Sandia National Laboratories
@ -85,21 +84,23 @@ void KimInit::command(int narg, char **arg)
if ((narg < 2) || (narg > 3)) error->all(FLERR, "Illegal 'kim init' command");
if (domain->box_exist)
error->all(FLERR, "Must use 'kim init' command before "
"simulation box is defined");
error->all(FLERR, "Must use 'kim init' command before simulation box is defined");
char *model_name = utils::strdup(arg[0]);
char *user_units = utils::strdup(arg[1]);
if (narg == 3) {
auto arg_str = std::string(arg[2]);
if (arg_str == "unit_conversion_mode") unit_conversion_mode = true;
if (arg_str == "unit_conversion_mode")
unit_conversion_mode = true;
else {
error->all(FLERR, "Illegal 'kim init' command.\nThe argument "
"followed by unit_style {} is an optional "
"argument and when is used must "
"be unit_conversion_mode", user_units);
error->all(FLERR,
"Illegal 'kim init' command.\n"
"The argument followed by unit_style {} is an optional argument and when "
"is used must be unit_conversion_mode",
user_units);
}
} else unit_conversion_mode = false;
} else
unit_conversion_mode = false;
char *model_units;
KIM_Model *pkim = nullptr;
@ -117,14 +118,9 @@ void KimInit::command(int narg, char **arg)
/* ---------------------------------------------------------------------- */
namespace {
void get_kim_unit_names(
char const * const system,
KIM_LengthUnit & lengthUnit,
KIM_EnergyUnit & energyUnit,
KIM_ChargeUnit & chargeUnit,
KIM_TemperatureUnit & temperatureUnit,
KIM_TimeUnit & timeUnit,
Error * error)
void get_kim_unit_names(char const *const system, KIM_LengthUnit &lengthUnit,
KIM_EnergyUnit &energyUnit, KIM_ChargeUnit &chargeUnit,
KIM_TemperatureUnit &temperatureUnit, KIM_TimeUnit &timeUnit, Error *error)
{
const std::string system_str(system);
if (system_str == "real") {
@ -157,20 +153,64 @@ void get_kim_unit_names(
chargeUnit = KIM_CHARGE_UNIT_e;
temperatureUnit = KIM_TEMPERATURE_UNIT_K;
timeUnit = KIM_TIME_UNIT_fs;
} else if ((system_str == "lj") ||
(system_str == "micro") ||
(system_str == "nano")) {
error->all(FLERR, "LAMMPS unit_style {} not supported "
"by KIM models", system_str);
} else if ((system_str == "lj") || (system_str == "micro") || (system_str == "nano")) {
error->all(FLERR, "LAMMPS unit_style {} not supported by KIM models", system_str);
} else {
error->all(FLERR, "Unknown unit_style");
}
}
} // namespace
} // namespace
void KimInit::determine_model_type_and_units(char * model_name,
char * user_units,
char ** model_units,
void KimInit::print_dirs(struct KIM_Collections *const collections) const
{
int kim_error = 0;
int dirListExtent = 0;
int dirCounter = 0;
std::string mesg = "#=== KIM is looking for 'Portable Models' in these directories ===\n";
std::vector<struct KIM_Collection> collection_list;
collection_list.push_back(KIM_COLLECTION_currentWorkingDirectory);
collection_list.push_back(KIM_COLLECTION_environmentVariable);
collection_list.push_back(KIM_COLLECTION_user);
collection_list.push_back(KIM_COLLECTION_system);
for (auto col : collection_list) {
kim_error = KIM_Collections_CacheListOfDirectoryNames(
collections, col, KIM_COLLECTION_ITEM_TYPE_portableModel, &dirListExtent);
if (!kim_error) {
for (int i = 0; i < dirListExtent; ++i) {
char const *name;
kim_error = KIM_Collections_GetDirectoryName(collections, i, &name);
// Don't check for error due to bug in kim-api-2.2.1 and below.
#if ((KIM_VERSION_MAJOR * 1000 + KIM_VERSION_MINOR) * 1000 + KIM_VERSION_PATCH) <= 2002001
kim_error = 0;
#endif
if (!kim_error) mesg += fmt::format("# {:2}: {}\n", ++dirCounter, name);
}
}
}
dirCounter = 0;
mesg += "#=== KIM is looking for 'Simulator Models' in these directories ===\n";
for (auto col : collection_list) {
kim_error = KIM_Collections_CacheListOfDirectoryNames(
collections, col, KIM_COLLECTION_ITEM_TYPE_simulatorModel, &dirListExtent);
if (!kim_error) {
for (int i = 0; i < dirListExtent; ++i) {
char const *name;
kim_error = KIM_Collections_GetDirectoryName(collections, i, &name);
// Don't check for error due to bug in kim-api-2.2.1 and below.
#if ((KIM_VERSION_MAJOR * 1000 + KIM_VERSION_MINOR) * 1000 + KIM_VERSION_PATCH) <= 2002001
kim_error = 0;
#endif
if (!kim_error) mesg += fmt::format("# {:2}: {}\n", ++dirCounter, name);
}
}
}
input->write_echo(mesg);
}
void KimInit::determine_model_type_and_units(char *model_name, char *user_units, char **model_units,
KIM_Model *&pkim)
{
KIM_LengthUnit lengthUnit;
@ -179,33 +219,26 @@ void KimInit::determine_model_type_and_units(char * model_name,
KIM_TemperatureUnit temperatureUnit;
KIM_TimeUnit timeUnit;
int units_accepted;
KIM_Collections * collections;
KIM_Collections *collections;
KIM_CollectionItemType itemType;
int kim_error = KIM_Collections_Create(&collections);
if (kim_error)
error->all(FLERR, "Unable to access KIM Collections to find Model");
if (kim_error) error->all(FLERR, "Unable to access KIM Collections to find Model");
auto logID = fmt::format("{}_Collections", comm->me);
KIM_Collections_SetLogID(collections, logID.c_str());
print_dirs(collections);
kim_error = KIM_Collections_GetItemType(collections, model_name, &itemType);
if (kim_error) error->all(FLERR, "KIM Model name not found");
KIM_Collections_Destroy(&collections);
if (KIM_CollectionItemType_Equal(itemType,
KIM_COLLECTION_ITEM_TYPE_portableModel)) {
get_kim_unit_names(user_units, lengthUnit, energyUnit,
chargeUnit, temperatureUnit, timeUnit, error);
int kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased,
lengthUnit,
energyUnit,
chargeUnit,
temperatureUnit,
timeUnit,
model_name,
&units_accepted,
&pkim);
if (KIM_CollectionItemType_Equal(itemType, KIM_COLLECTION_ITEM_TYPE_portableModel)) {
get_kim_unit_names(user_units, lengthUnit, energyUnit, chargeUnit, temperatureUnit, timeUnit,
error);
int kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased, lengthUnit, energyUnit, chargeUnit,
temperatureUnit, timeUnit, model_name, &units_accepted, &pkim);
if (kim_error) error->all(FLERR, "Unable to load KIM Simulator Model");
@ -219,20 +252,12 @@ void KimInit::determine_model_type_and_units(char * model_name,
} else if (unit_conversion_mode) {
KIM_Model_Destroy(&pkim);
int const num_systems = 5;
char const * const systems[num_systems]
= {"metal", "real", "si", "cgs", "electron"};
for (int i=0; i < num_systems; ++i) {
get_kim_unit_names(systems[i], lengthUnit, energyUnit,
chargeUnit, temperatureUnit, timeUnit, error);
kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased,
lengthUnit,
energyUnit,
chargeUnit,
temperatureUnit,
timeUnit,
model_name,
&units_accepted,
&pkim);
char const *const systems[num_systems] = {"metal", "real", "si", "cgs", "electron"};
for (int i = 0; i < num_systems; ++i) {
get_kim_unit_names(systems[i], lengthUnit, energyUnit, chargeUnit, temperatureUnit,
timeUnit, error);
kim_error = KIM_Model_Create(KIM_NUMBERING_zeroBased, lengthUnit, energyUnit, chargeUnit,
temperatureUnit, timeUnit, model_name, &units_accepted, &pkim);
if (units_accepted) {
logID = fmt::format("{}_Model", comm->me);
KIM_Model_SetLogID(pkim, logID.c_str());
@ -246,12 +271,10 @@ void KimInit::determine_model_type_and_units(char * model_name,
KIM_Model_Destroy(&pkim);
error->all(FLERR, "KIM Model does not support the requested unit system");
}
} else if (KIM_CollectionItemType_Equal(
itemType, KIM_COLLECTION_ITEM_TYPE_simulatorModel)) {
KIM_SimulatorModel * simulatorModel;
} else if (KIM_CollectionItemType_Equal(itemType, KIM_COLLECTION_ITEM_TYPE_simulatorModel)) {
KIM_SimulatorModel *simulatorModel;
kim_error = KIM_SimulatorModel_Create(model_name, &simulatorModel);
if (kim_error)
error->all(FLERR, "Unable to load KIM Simulator Model");
if (kim_error) error->all(FLERR, "Unable to load KIM Simulator Model");
model_type = SM;
logID = fmt::format("{}_SimulatorModel", comm->me);
@ -264,13 +287,11 @@ void KimInit::determine_model_type_and_units(char * model_name,
KIM_SimulatorModel_GetNumberOfSimulatorFields(simulatorModel, &sim_fields);
KIM_SimulatorModel_CloseTemplateMap(simulatorModel);
for (int i = 0; i < sim_fields; ++i) {
KIM_SimulatorModel_GetSimulatorFieldMetadata(
simulatorModel, i, &sim_lines, &sim_field);
KIM_SimulatorModel_GetSimulatorFieldMetadata(simulatorModel, i, &sim_lines, &sim_field);
const std::string sim_field_str(sim_field);
if (sim_field_str == "units") {
KIM_SimulatorModel_GetSimulatorFieldLine(
simulatorModel, i, 0, &sim_value);
KIM_SimulatorModel_GetSimulatorFieldLine(simulatorModel, i, 0, &sim_value);
*model_units = utils::strdup(sim_value);
break;
}
@ -280,16 +301,15 @@ void KimInit::determine_model_type_and_units(char * model_name,
const std::string model_units_str(*model_units);
const std::string user_units_str(user_units);
if ((!unit_conversion_mode) && (model_units_str != user_units_str)) {
error->all(FLERR, "Incompatible units for KIM Simulator Model"
", required units = {}", model_units_str);
error->all(FLERR, "Incompatible units for KIM Simulator Model, required units = {}",
model_units_str);
}
}
}
/* ---------------------------------------------------------------------- */
void KimInit::do_init(char *model_name, char *user_units, char *model_units,
KIM_Model *&pkim)
void KimInit::do_init(char *model_name, char *user_units, char *model_units, KIM_Model *&pkim)
{
// create storage proxy fix. delete existing fix, if needed.
@ -304,8 +324,7 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
fix_store->setptr("model_units", (void *) model_units);
// Begin output to log file
input->write_echo("#=== BEGIN kim init ==================================="
"=======\n");
input->write_echo("#=== BEGIN kim init ==========================================\n");
KIM_SimulatorModel *simulatorModel;
if (model_type == SM) {
@ -316,18 +335,16 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
KIM_SimulatorModel_SetLogID(simulatorModel, logID.c_str());
char const *sim_name, *sim_version;
KIM_SimulatorModel_GetSimulatorNameAndVersion(
simulatorModel, &sim_name, &sim_version);
KIM_SimulatorModel_GetSimulatorNameAndVersion(simulatorModel, &sim_name, &sim_version);
const std::string sim_name_str(sim_name);
if (sim_name_str != "LAMMPS")
error->all(FLERR, "Incompatible KIM Simulator Model");
if (sim_name_str != "LAMMPS") error->all(FLERR, "Incompatible KIM Simulator Model");
if (comm->me == 0) {
auto mesg = fmt::format("# Using KIM Simulator Model : {}\n"
"# For Simulator : {} {}\n"
"# Running on : LAMMPS {}\n#\n", model_name,
sim_name_str, sim_version, lmp->version);
"# For Simulator : {} {}\n"
"# Running on : LAMMPS {}\n#\n",
model_name, sim_name_str, sim_version, lmp->version);
utils::logmesg(lmp, mesg);
}
@ -350,18 +367,16 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
// Set the skin and timestep default values to
// 2.0 Angstroms and 1.0 femtosecond
const std::string skin_cmd =
(model_units_str == "real") ? "neighbor 2.0 bin # Angstroms":
(model_units_str == "metal") ? "neighbor 2.0 bin # Angstroms":
(model_units_str == "si") ? "neighbor 2e-10 bin # meters":
(model_units_str == "cgs") ? "neighbor 2e-8 bin # centimeters":
"neighbor 3.77945224 bin # Bohr";
const std::string step_cmd =
(model_units_str == "real") ? "timestep 1.0 # femtoseconds":
(model_units_str == "metal") ? "timestep 1.0e-3 # picoseconds":
(model_units_str == "si") ? "timestep 1e-15 # seconds":
(model_units_str == "cgs") ? "timestep 1e-15 # seconds":
"timestep 1.0 # femtoseconds";
const std::string skin_cmd = (model_units_str == "real") ? "neighbor 2.0 bin # Angstroms"
: (model_units_str == "metal") ? "neighbor 2.0 bin # Angstroms"
: (model_units_str == "si") ? "neighbor 2e-10 bin # meters"
: (model_units_str == "cgs") ? "neighbor 2e-8 bin # centimeters"
: "neighbor 3.77945224 bin # Bohr";
const std::string step_cmd = (model_units_str == "real") ? "timestep 1.0 # femtoseconds"
: (model_units_str == "metal") ? "timestep 1.0e-3 # picoseconds"
: (model_units_str == "si") ? "timestep 1e-15 # seconds"
: (model_units_str == "cgs") ? "timestep 1e-15 # seconds"
: "timestep 1.0 # femtoseconds";
input->one(skin_cmd);
input->one(step_cmd);
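The chained conditionals above pick the default neighbor skin and timestep commands per unit system. Purely as an illustration (not code from this commit), the same mapping written as a table lookup reads:

// Illustrative only: the unit-system -> default-command mapping from the
// conditionals above expressed as a lookup table.
#include <cstdio>
#include <string>
#include <unordered_map>

struct KimDefaults {
  const char *skin_cmd;    // "neighbor ..." command issued via input->one()
  const char *step_cmd;    // "timestep ..." command issued via input->one()
};

static const std::unordered_map<std::string, KimDefaults> kim_sm_defaults = {
    {"real", {"neighbor 2.0 bin # Angstroms", "timestep 1.0 # femtoseconds"}},
    {"metal", {"neighbor 2.0 bin # Angstroms", "timestep 1.0e-3 # picoseconds"}},
    {"si", {"neighbor 2e-10 bin # meters", "timestep 1e-15 # seconds"}},
    {"cgs", {"neighbor 2e-8 bin # centimeters", "timestep 1e-15 # seconds"}},
    {"electron", {"neighbor 3.77945224 bin # Bohr", "timestep 1.0 # femtoseconds"}},
};

int main()
{
  const auto &d = kim_sm_defaults.at("metal");
  std::printf("%s\n%s\n", d.skin_cmd, d.step_cmd);  // commands used for metal units
  return 0;
}

For the five systems probed earlier this behaves the same as the ternary chain; the "electron" entry stands in for the fallback branch.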
@ -373,14 +388,12 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
// init model
for (int i = 0; i < sim_fields; ++i) {
KIM_SimulatorModel_GetSimulatorFieldMetadata(
simulatorModel, i, &sim_lines, &sim_field);
KIM_SimulatorModel_GetSimulatorFieldMetadata(simulatorModel, i, &sim_lines, &sim_field);
const std::string sim_field_str(sim_field);
if (sim_field_str == "model-init") {
for (int j = 0; j < sim_lines; ++j) {
KIM_SimulatorModel_GetSimulatorFieldLine(
simulatorModel, i, j, &sim_value);
KIM_SimulatorModel_GetSimulatorFieldLine(simulatorModel, i, j, &sim_value);
input->one(sim_value);
}
break;
@ -404,31 +417,28 @@ void KimInit::do_init(char *model_name, char *user_units, char *model_units,
int max_len(0);
for (int i = 0; i < numberOfParameters; ++i) {
KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType,
&extent, &str_name, &str_desc);
max_len = MAX(max_len, (int)strlen(str_name));
KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType, &extent, &str_name, &str_desc);
max_len = MAX(max_len, (int) strlen(str_name));
}
max_len = MAX(18, max_len + 1);
mesg += fmt::format(" No. | {:<{}} | data type | extent\n",
"Parameter name", max_len);
mesg += fmt::format(" No. | {:<{}} | data type | extent\n", "Parameter name", max_len);
mesg += fmt::format("{:-<{}}\n", "-", max_len + 35);
for (int i = 0; i < numberOfParameters; ++i) {
KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType,
&extent, &str_name, &str_desc);
KIM_Model_GetParameterMetadata(pkim, i, &kim_DataType, &extent, &str_name, &str_desc);
auto data_type = std::string("\"");
data_type += KIM_DataType_ToString(kim_DataType) + std::string("\"");
mesg += fmt::format(" {:<8} | {:<{}} | {:<10} | {}\n", i + 1, str_name,
max_len, data_type, extent);
mesg += fmt::format(" {:<8} | {:<{}} | {:<10} | {}\n", i + 1, str_name, max_len, data_type,
extent);
}
} else mesg += "No mutable parameters.\n";
} else
mesg += "No mutable parameters.\n";
KIM_Model_Destroy(&pkim);
input->write_echo(mesg);
}
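The parameter table above leans on two less common fmt format specs: "{:<{}}" left-aligns its argument in a width supplied at run time, and "{:-<{}}" pads a "-" with '-' fill characters up to that width to draw the horizontal rule. A self-contained illustration follows; the sample row values are made up.

// Stand-alone illustration of the fmt dynamic-width specs used above.
#include <fmt/format.h>

int main()
{
  const int max_len = 18;  // column width, computed from the longest name above
  std::string mesg;
  mesg += fmt::format(" No. | {:<{}} | data type | extent\n", "Parameter name", max_len);
  mesg += fmt::format("{:-<{}}\n", "-", max_len + 35);              // 53 dashes
  mesg += fmt::format(" {:<8} | {:<{}} | {:<10} | {}\n", 1, "cutoff", max_len, "\"Double\"", 1);
  fmt::print("{}", mesg);
  return 0;
}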
// End output to log file
input->write_echo("#=== END kim init ====================================="
"=======\n\n");
input->write_echo("#=== END kim init ============================================\n\n");
}
/* ---------------------------------------------------------------------- */
@ -446,24 +456,11 @@ void KimInit::do_variables(const std::string &from, const std::string &to)
int ier;
std::string var_str;
int v_unit;
const char *units[] = {"mass",
"distance",
"time",
"energy",
"velocity",
"force",
"torque",
"temperature",
"pressure",
"viscosity",
"charge",
"dipole",
"efield",
"density",
nullptr};
const char *units[] = {"mass", "distance", "time", "energy", "velocity",
"force", "torque", "temperature", "pressure", "viscosity",
"charge", "dipole", "efield", "density", nullptr};
input->write_echo(fmt::format("# Conversion factors from {} to {}:\n",
from, to));
input->write_echo(fmt::format("# Conversion factors from {} to {}:\n", from, to));
auto variable = input->variable;
for (int i = 0; units[i] != nullptr; ++i) {
@ -473,24 +470,23 @@ void KimInit::do_variables(const std::string &from, const std::string &to)
variable->set(var_str + " internal 1.0");
v_unit = variable->find(var_str.c_str());
}
ier = lammps_unit_conversion(units[i], from, to,
conversion_factor);
ier = lammps_unit_conversion(units[i], from, to, conversion_factor);
if (ier != 0)
error->all(FLERR, "Unable to obtain conversion factor: "
"unit = {}; from = {}; to = {}",
units[i], from, to);
error->all(FLERR,
"Unable to obtain conversion factor: "
"unit = {}; from = {}; to = {}",
units[i], from, to);
variable->internal_set(v_unit, conversion_factor);
input->write_echo(fmt::format("variable {:<15s} internal {:<15.12e}\n",
var_str, conversion_factor));
input->write_echo(
fmt::format("variable {:<15s} internal {:<15.12e}\n", var_str, conversion_factor));
}
input->write_echo("#\n");
}
/* ---------------------------------------------------------------------- */
void KimInit::write_log_cite(class LAMMPS *lmp,
KimInit::model_type_enum model_type,
void KimInit::write_log_cite(class LAMMPS *lmp, KimInit::model_type_enum model_type,
char *model_name)
{
if (!lmp->citeme) return;
@ -501,7 +497,7 @@ void KimInit::write_log_cite(class LAMMPS *lmp,
std::string cite_id;
if (kim_id.empty()) {
cite_id = fmt::format("KIM potential: unpublished, \"{}\"\n",model_name_str);
cite_id = fmt::format("KIM potential: unpublished, \"{}\"\n", model_name_str);
} else {
KIM_Collections *collections;
int err = KIM_Collections_Create(&collections);
@ -513,12 +509,10 @@ void KimInit::write_log_cite(class LAMMPS *lmp,
int extent;
if (model_type == MO) {
err = KIM_Collections_CacheListOfItemMetadataFiles(
collections, KIM_COLLECTION_ITEM_TYPE_portableModel,
model_name, &extent);
collections, KIM_COLLECTION_ITEM_TYPE_portableModel, model_name, &extent);
} else if (model_type == SM) {
err = KIM_Collections_CacheListOfItemMetadataFiles(
collections, KIM_COLLECTION_ITEM_TYPE_simulatorModel,
model_name, &extent);
collections, KIM_COLLECTION_ITEM_TYPE_simulatorModel, model_name, &extent);
} else {
lmp->error->all(FLERR, "Unknown model type");
}
@ -529,19 +523,18 @@ void KimInit::write_log_cite(class LAMMPS *lmp,
}
cite_id = fmt::format("OpenKIM potential: https://openkim.org/cite/"
"{}#item-citation\n\n",kim_id);
"{}#item-citation\n\n",
kim_id);
for (int i = 0; i < extent; ++i) {
char const *fileName;
int availableAsString;
char const *fileString;
err = KIM_Collections_GetItemMetadataFile(
collections, i, &fileName, nullptr, nullptr,
&availableAsString, &fileString);
err = KIM_Collections_GetItemMetadataFile(collections, i, &fileName, nullptr, nullptr,
&availableAsString, &fileString);
if (err) continue;
if (utils::strmatch(fileName, "^kimcite") && availableAsString)
cite_id += fileString;
if (utils::strmatch(fileName, "^kimcite") && availableAsString) cite_id += fileString;
}
KIM_Collections_Destroy(&collections);
}

View File

@ -62,7 +62,8 @@
#include "pointers.h"
// Forward declaration.
typedef struct KIM_Model KIM_Model;
struct KIM_Model;
struct KIM_Collections;
namespace LAMMPS_NS {
@ -80,6 +81,8 @@ class KimInit : protected Pointers {
void determine_model_type_and_units(char *, char *, char **, KIM_Model *&);
void do_init(char *, char *, char *, KIM_Model *&);
void do_variables(const std::string &, const std::string &);
void print_dirs(struct KIM_Collections * const collections) const;
};
} // namespace LAMMPS_NS
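The header hunk above replaces the C-style typedef with plain forward declarations. In C++ a bare "struct KIM_Model;" already introduces the type name, which is all the header needs in order to declare pointer and reference parameters; only code that dereferences the type must include the real KIM headers. A minimal illustration (function names invented for the example):

// Why a plain forward declaration suffices in the header.
struct KIM_Model;        // incomplete type: size and members unknown here
struct KIM_Collections;  // same idea for the second handle type

// Declarations that only mention pointers or references compile fine:
void do_init_example(KIM_Model *&pkim);
void print_dirs_example(KIM_Collections *collections);

int main()
{
  KIM_Model *pkim = nullptr;   // a pointer to an incomplete type is allowed
  (void) pkim;
  return 0;
}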

View File

@ -17,23 +17,23 @@
------------------------------------------------------------------------- */
#include "pair_eam_alloy_kokkos.h"
#include <cmath>
#include <cstring>
#include "kokkos.h"
#include "pair_kokkos.h"
#include "atom_kokkos.h"
#include "force.h"
#include "atom_masks.h"
#include "comm.h"
#include "neighbor.h"
#include "error.h"
#include "force.h"
#include "kokkos.h"
#include "memory_kokkos.h"
#include "neigh_list_kokkos.h"
#include "neigh_request.h"
#include "memory_kokkos.h"
#include "error.h"
#include "atom_masks.h"
#include "tokenizer.h"
#include "neighbor.h"
#include "pair_kokkos.h"
#include "potential_file_reader.h"
#include <cmath>
#include <cstring>
using namespace LAMMPS_NS;
// Cannot use virtual inheritance on the GPU, so must duplicate code
@ -44,8 +44,8 @@ template<class DeviceType>
PairEAMAlloyKokkos<DeviceType>::PairEAMAlloyKokkos(LAMMPS *lmp) : PairEAM(lmp)
{
respa_enable = 0;
single_enable = 0;
one_coeff = 1;
manybody_flag = 1;
kokkosable = 1;
atomKK = (AtomKokkos *) atom;
@ -261,6 +261,8 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
virial[5] += ev.v[5];
}
if (vflag_fdotr) pair_virial_fdotr_compute(this);
if (eflag_atom) {
if (need_dup)
Kokkos::Experimental::contribute(d_eatom, dup_eatom);
@ -275,8 +277,6 @@ void PairEAMAlloyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
k_vatom.template sync<LMPHostType>();
}
if (vflag_fdotr) pair_virial_fdotr_compute(this);
copymode = 0;
// free duplicated memory
@ -322,6 +322,11 @@ void PairEAMAlloyKokkos<DeviceType>::init_style()
}
/* ----------------------------------------------------------------------
convert read-in funcfl potential(s) to standard array format
interpolate all file values to a single grid and cutoff
------------------------------------------------------------------------- */
template<class DeviceType>
void PairEAMAlloyKokkos<DeviceType>::file2array()
{
@ -524,7 +529,7 @@ void PairEAMAlloyKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, doubl
h_rho[j] += buf[m++];
}
k_fp.modify_host();
k_rho.modify_host();
}
/* ---------------------------------------------------------------------- */
@ -581,8 +586,8 @@ void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelA<NEIGHFLAG
d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6);
if (NEWTON_PAIR || j < nlocal) {
const int d_type2rhor_ij = d_type2rhor(itype,jtype);
a_rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p +
d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6);
a_rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p +
d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6);
}
}
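The re-indented lines above evaluate one cubic segment of the tabulated density spline in Horner form, value = ((c3*p + c4)*p + c5)*p + c6, with p the fractional position inside the grid interval and c3..c6 the coefficients stored in columns 3 to 6 of the spline array. A tiny stand-alone version of that arithmetic (names invented for the example):

// Horner-form evaluation of one cubic spline segment, as in the expressions above.
#include <cstdio>

static double eval_cubic_segment(double c3, double c4, double c5, double c6, double p)
{
  return ((c3 * p + c4) * p + c5) * p + c6;
}

int main()
{
  // example: the segment 1 + 2p + 3p^2 + 4p^3 evaluated at p = 0.25
  std::printf("%g\n", eval_cubic_segment(4.0, 3.0, 2.0, 1.0, 0.25));  // prints 1.75
  return 0;
}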
@ -597,7 +602,6 @@ template<class DeviceType>
template<int EFLAG>
KOKKOS_INLINE_FUNCTION
void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelB<EFLAG>, const int &ii, EV_FLOAT& ev) const {
// fp = derivative of embedding energy at each atom
// phi = embedding energy at each atom
// if rho > rhomax (e.g. due to close approach of two atoms),
@ -620,7 +624,6 @@ void PairEAMAlloyKokkos<DeviceType>::operator()(TagPairEAMAlloyKernelB<EFLAG>, c
if (eflag_global) ev.evdwl += phi;
if (eflag_atom) d_eatom[i] += phi;
}
}
template<class DeviceType>
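Earlier in this file's diff, compute() reduces the duplicated per-atom arrays with Kokkos::Experimental::contribute(d_eatom, dup_eatom) when need_dup is set. The sketch below shows that duplicate/contribute pattern in isolation with a plain Kokkos ScatterView; all names are invented for the example and it is not code from this commit.

// Sketch of the duplication pattern behind need_dup/contribute: threads accumulate
// into duplicated (or atomic) copies and contribute() reduces them back.
#include <Kokkos_Core.hpp>
#include <Kokkos_ScatterView.hpp>

int main(int argc, char **argv)
{
  Kokkos::initialize(argc, argv);
  {
    const int natoms = 1000;
    Kokkos::View<double *> eatom("eatom", natoms);
    auto dup_eatom = Kokkos::Experimental::create_scatter_view(eatom);

    Kokkos::parallel_for("accumulate", 10 * natoms, KOKKOS_LAMBDA(const int i) {
      auto a_eatom = dup_eatom.access();  // race-free accumulator handle
      a_eatom(i % natoms) += 0.5;         // many iterations may hit the same atom
    });

    // reduce the duplicated copies back into eatom (the "contribute" step)
    Kokkos::Experimental::contribute(eatom, dup_eatom);
  }
  Kokkos::finalize();
  return 0;
}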

View File

@ -17,23 +17,23 @@
------------------------------------------------------------------------- */
#include "pair_eam_fs_kokkos.h"
#include <cmath>
#include <cstring>
#include "kokkos.h"
#include "pair_kokkos.h"
#include "atom_kokkos.h"
#include "force.h"
#include "atom_masks.h"
#include "comm.h"
#include "neighbor.h"
#include "error.h"
#include "force.h"
#include "kokkos.h"
#include "memory_kokkos.h"
#include "neigh_list_kokkos.h"
#include "neigh_request.h"
#include "memory_kokkos.h"
#include "error.h"
#include "atom_masks.h"
#include "tokenizer.h"
#include "neighbor.h"
#include "pair_kokkos.h"
#include "potential_file_reader.h"
#include <cmath>
#include <cstring>
using namespace LAMMPS_NS;
// Cannot use virtual inheritance on the GPU, so must duplicate code
@ -43,9 +43,9 @@ using namespace LAMMPS_NS;
template<class DeviceType>
PairEAMFSKokkos<DeviceType>::PairEAMFSKokkos(LAMMPS *lmp) : PairEAM(lmp)
{
one_coeff = 1;
manybody_flag = 1;
respa_enable = 0;
single_enable = 0;
one_coeff = 1;
kokkosable = 1;
atomKK = (AtomKokkos *) atom;
@ -200,9 +200,9 @@ void PairEAMFSKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
// communicate derivative of embedding function (on the device)
k_fp.template sync<DeviceType>();
comm->forward_comm_pair(this);
k_fp.template modify<DeviceType>();
comm->forward_comm_pair(this);
k_fp.template sync<DeviceType>();
// compute kernel C
@ -322,6 +322,11 @@ void PairEAMFSKokkos<DeviceType>::init_style()
}
/* ----------------------------------------------------------------------
convert read-in funcfl potential(s) to standard array format
interpolate all file values to a single grid and cutoff
------------------------------------------------------------------------- */
template<class DeviceType>
void PairEAMFSKokkos<DeviceType>::file2array()
{
@ -581,8 +586,8 @@ void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelA<NEIGHFLAG,NEWTO
d_rhor_spline(d_type2rhor_ji,m,5))*p + d_rhor_spline(d_type2rhor_ji,m,6);
if (NEWTON_PAIR || j < nlocal) {
const int d_type2rhor_ij = d_type2rhor(itype,jtype);
a_rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p +
d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6);
a_rho[j] += ((d_rhor_spline(d_type2rhor_ij,m,3)*p + d_rhor_spline(d_type2rhor_ij,m,4))*p +
d_rhor_spline(d_type2rhor_ij,m,5))*p + d_rhor_spline(d_type2rhor_ij,m,6);
}
}
@ -620,7 +625,6 @@ void PairEAMFSKokkos<DeviceType>::operator()(TagPairEAMFSKernelB<EFLAG>, const i
if (eflag_global) ev.evdwl += phi;
if (eflag_atom) d_eatom[i] += phi;
}
}
template<class DeviceType>

View File

@ -17,18 +17,20 @@
------------------------------------------------------------------------- */
#include "pair_eam_kokkos.h"
#include <cmath>
#include "kokkos.h"
#include "pair_kokkos.h"
#include "atom_kokkos.h"
#include "force.h"
#include "atom_masks.h"
#include "comm.h"
#include "neighbor.h"
#include "error.h"
#include "force.h"
#include "kokkos.h"
#include "memory_kokkos.h"
#include "neigh_list_kokkos.h"
#include "neigh_request.h"
#include "memory_kokkos.h"
#include "error.h"
#include "atom_masks.h"
#include "neighbor.h"
#include "pair_kokkos.h"
#include <cmath>
using namespace LAMMPS_NS;

View File

@ -36,7 +36,7 @@ From: centos:7
# manually install Plumed
mkdir plumed
cd plumed
version=2.6.1
version=2.7.2
curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
tar -xzf plumed.tar.gz
cd plumed-${version}

View File

@ -3,7 +3,7 @@ From: centos:8
%post
dnf -y install epel-release dnf-utils
dnf config-manager --set-enabled PowerTools
dnf config-manager --set-enabled powertools
dnf -y update
dnf -y install vim-enhanced git file make cmake patch which file ninja-build \
ccache gcc-c++ gcc-gfortran clang libomp-devel gdb valgrind libubsan libasan libtsan \
@ -42,7 +42,7 @@ From: centos:8
# manually install Plumed
mkdir plumed
cd plumed
version=2.6.1
version=2.7.2
curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
tar -xzf plumed.tar.gz
cd plumed-${version}

View File

@ -0,0 +1,110 @@
BootStrap: docker
From: rockylinux/rockylinux:8
%post
dnf -y install epel-release dnf-utils
dnf config-manager --set-enabled powertools
dnf -y update
dnf -y install vim-enhanced git file make cmake patch which file ninja-build \
ccache gcc-c++ gcc-gfortran clang libomp-devel gdb valgrind libubsan libasan libtsan \
eigen3-devel openblas-devel libpng-devel libjpeg-devel platform-python-devel \
openmpi-devel mpich-devel fftw-devel voro++-devel gsl-devel hdf5-devel \
netcdf-devel netcdf-cxx-devel netcdf-mpich-devel netcdf-openmpi-devel \
enchant python3-virtualenv doxygen diffutils latexmk readline-devel \
texlive-latex-fonts texlive-pslatex texlive-collection-latexrecommended \
texlive-latex texlive-latexconfig doxygen-latex texlive-collection-latex \
texlive-latex-bin texlive-lualatex-math texlive-fncychap texlive-tabulary \
texlive-framed texlive-wrapfig texlive-upquote texlive-capt-of \
texlive-needspace texlive-titlesec texlive-anysize texlive-dvipng \
blas-devel lapack-devel libyaml-devel openkim-models kim-api-devel \
zstd libzstd-devel
dnf clean all
# we need to reset any module variables
# inherited from the host.
unset __LMOD_REF_COUNT__LMFILES_
unset __LMOD_REF_COUNT_PATH
unset __LMOD_REF_COUNT_LD_LIBRARY_PATH
unset __LMOD_REF_COUNT_MANPATH
unset __LMOD_REF_COUNT_MODULEPATH
unset __LMOD_REF_COUNT_LOADEDMODULES
unset _LMFILES_
unset MODULEPATH
unset MODULESHOME
unset MODULEPATH_ROOT
unset LOADEDMODULES
unset LMOD_SYSTEM_DEFAULT_MODULES
# load MPI by default
. /etc/profile
module load mpi
# manually install Plumed
mkdir plumed
cd plumed
version=2.7.2
curl -L -o plumed.tar.gz https://github.com/plumed/plumed2/releases/download/v${version}/plumed-src-${version}.tgz
tar -xzf plumed.tar.gz
cd plumed-${version}
./configure --disable-doc --prefix=/usr
make
make install
# fix up installation for CentOS and Fedora
mv -v /usr/lib64/pkgconfig/plumed* /usr/share/pkgconfig/
cd ../../
rm -rvf plumed
# create missing readline pkgconfig file
cat > /usr/lib64/pkgconfig/readline.pc <<EOF
prefix=/usr
exec_prefix=/usr
libdir=/usr/lib64
includedir=/usr/include
Name: Readline
Description: GNU Readline library for command line editing
URL: http://tiswww.cwru.edu/php/chet/readline/rltop.html
Version: 7.0
Requires.private: ncurses
Libs: -L\${libdir} -lreadline
Cflags: -I\${includedir}/readline
EOF
# set custom prompt indicating the container name
CUSTOM_PROMPT_ENV=/.singularity.d/env/99-zz_custom_prompt.sh
cat >$CUSTOM_PROMPT_ENV <<EOF
#!/bin/bash
PS1="[centos8:\u@\h] \W> "
EOF
chmod 755 $CUSTOM_PROMPT_ENV
%environment
LC_ALL=C
export LC_ALL
# we need to reset any module variables
# inherited from the host.
unset __LMOD_REF_COUNT__LMFILES_
unset __LMOD_REF_COUNT_PATH
unset __LMOD_REF_COUNT_LD_LIBRARY_PATH
unset __LMOD_REF_COUNT_MANPATH
unset __LMOD_REF_COUNT_MODULEPATH
unset __LMOD_REF_COUNT_LOADEDMODULES
unset _LMFILES_
unset MODULEPATH
unset MODULESHOME
unset MODULEPATH_ROOT
unset LOADEDMODULES
unset LMOD_SYSTEM_DEFAULT_MODULES
# load MPI by default
. /etc/profile
module load mpi
# tell OpenMPI to not try using Infiniband
OMPI_MCA_btl="^openib"
# do not warn about unused components as this messes up testing
OMPI_MCA_btl_base_warn_component_unused="0"
export OMPI_MCA_btl OMPI_MCA_btl_base_warn_component_unused
%labels
Author akohlmey, rbberger

View File

@ -105,7 +105,7 @@ From: ubuntu:18.04
# Plumed
###########################################################################
export PLUMED_PKG_VERSION=2.6.1
export PLUMED_PKG_VERSION=2.7.2
mkdir plumed
cd plumed

View File

@ -3,7 +3,7 @@ From: ubuntu:18.04
%environment
export PATH=/usr/lib/ccache:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
%post
export DEBIAN_FRONTEND=noninteractive
apt-get update
@ -94,7 +94,7 @@ From: ubuntu:18.04
###########################################################################
export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
git clone -b rocm-4.1.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
mkdir hipCUB/build
cd hipCUB/build
CXX=hipcc cmake -D BUILD_TEST=off ..
@ -129,7 +129,7 @@ From: ubuntu:18.04
# Plumed
###########################################################################
export PLUMED_PKG_VERSION=2.6.1
export PLUMED_PKG_VERSION=2.7.2
mkdir plumed
cd plumed

View File

@ -2,11 +2,11 @@ BootStrap: docker
From: ubuntu:18.04
%environment
export PATH=/usr/lib/ccache:/usr/local/cuda-11.0/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
export CUDADIR=/usr/local/cuda-11.0
export CUDA_PATH=/usr/local/cuda-11.0
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.0/lib64
export LIBRARY_PATH=/usr/local/cuda-11.0/lib64/stubs
export PATH=/usr/lib/ccache:/usr/local/cuda-11.4/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
export CUDADIR=/usr/local/cuda-11.4
export CUDA_PATH=/usr/local/cuda-11.4
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.4/lib64:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
export LIBRARY_PATH=/usr/local/cuda-11.4/lib64/stubs
%post
export DEBIAN_FRONTEND=noninteractive
apt-get update
@ -104,23 +104,19 @@ From: ubuntu:18.04
add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
apt-get update
export CUDA_PKG_VERSION=11.0
export CUDA_PKG_VERSION=11.4
apt-get install -y --no-install-recommends \
cuda-libraries-$CUDA_PKG_VERSION \
cuda-command-line-tools-$CUDA_PKG_VERSION \
cuda-libraries-dev-$CUDA_PKG_VERSION \
cuda-minimal-build-$CUDA_PKG_VERSION \
cuda-libraries-${CUDA_PKG_VERSION} \
cuda-command-line-tools-${CUDA_PKG_VERSION} \
cuda-libraries-dev-${CUDA_PKG_VERSION} \
cuda-minimal-build-${CUDA_PKG_VERSION} \
cuda-compat-$CUDA_PKG_VERSION \
libcublas-11-0 \
libcublas-dev-11-0
echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf
echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
libcublas-${CUDA_PKG_VERSION} \
libcublas-dev-${CUDA_PKG_VERSION}
# add missing symlink
ln -s /usr/local/cuda-11.0 /usr/local/cuda
ln -s /usr/local/cuda-11.0/lib64/stubs/libcuda.so /usr/local/cuda-11.0/lib64/stubs/libcuda.so.1
ln -s /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so.1
###########################################################################
# NVIDIA OpenCL
@ -134,7 +130,7 @@ From: ubuntu:18.04
###########################################################################
export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
git clone -b rocm-3.7.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
mkdir hipCUB/build
cd hipCUB/build
CXX=hipcc cmake -D BUILD_TEST=off ..
@ -169,7 +165,7 @@ From: ubuntu:18.04
# Plumed
###########################################################################
export PLUMED_PKG_VERSION=2.6.1
export PLUMED_PKG_VERSION=2.7.2
mkdir plumed
cd plumed

View File

@ -106,7 +106,7 @@ From: ubuntu:18.04
# Plumed
###########################################################################
export PLUMED_PKG_VERSION=2.6.1
export PLUMED_PKG_VERSION=2.7.2
mkdir plumed
cd plumed

View File

@ -1,5 +1,5 @@
BootStrap: docker
From: nvidia/cuda:11.0-devel-ubuntu18.04
From: nvidia/cuda:11.4.1-devel-ubuntu18.04
%post
export DEBIAN_FRONTEND=noninteractive
@ -105,7 +105,7 @@ From: nvidia/cuda:11.0-devel-ubuntu18.04
# Plumed
###########################################################################
export PLUMED_PKG_VERSION=2.6.1
export PLUMED_PKG_VERSION=2.7.2
mkdir plumed
cd plumed

View File

@ -100,7 +100,7 @@ From: ubuntu:20.04
# Plumed
###########################################################################
export PLUMED_PKG_VERSION=2.6.1
export PLUMED_PKG_VERSION=2.7.2
mkdir plumed
cd plumed

View File

@ -3,7 +3,7 @@ From: ubuntu:20.04
%environment
export PATH=/usr/lib/ccache:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.2.0/llvm/lib
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
%post
export DEBIAN_FRONTEND=noninteractive
apt-get update
@ -91,7 +91,7 @@ From: ubuntu:20.04
###########################################################################
export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
git clone -b rocm-4.1.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
mkdir hipCUB/build
cd hipCUB/build
CXX=hipcc cmake -D BUILD_TEST=off ..
@ -126,7 +126,7 @@ From: ubuntu:20.04
# Plumed
###########################################################################
export PLUMED_PKG_VERSION=2.6.1
export PLUMED_PKG_VERSION=2.7.2
mkdir plumed
cd plumed

View File

@ -2,11 +2,11 @@ BootStrap: docker
From: ubuntu:20.04
%environment
export PATH=/usr/lib/ccache:/usr/local/cuda-11.0/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
export CUDADIR=/usr/local/cuda-11.0
export CUDA_PATH=/usr/local/cuda-11.0
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.0/lib64:/opt/rocm/lib:/opt/rocm-4.2.0/llvm/lib
export LIBRARY_PATH=/usr/local/cuda-11.0/lib64/stubs
export PATH=/usr/lib/ccache:/usr/local/cuda-11.4/bin:${PATH}:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
export CUDADIR=/usr/local/cuda-11.4
export CUDA_PATH=/usr/local/cuda-11.4
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.4/lib64:/opt/rocm/lib:/opt/rocm-4.3.0/llvm/lib
export LIBRARY_PATH=/usr/local/cuda-11.4/lib64/stubs
%post
export DEBIAN_FRONTEND=noninteractive
apt-get update
@ -101,23 +101,19 @@ From: ubuntu:20.04
add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /"
apt-get update
export CUDA_PKG_VERSION=11.0
export CUDA_PKG_VERSION=11.4
apt-get install -y --no-install-recommends \
cuda-libraries-$CUDA_PKG_VERSION \
cuda-command-line-tools-$CUDA_PKG_VERSION \
cuda-libraries-dev-$CUDA_PKG_VERSION \
cuda-minimal-build-$CUDA_PKG_VERSION \
cuda-libraries-${CUDA_PKG_VERSION} \
cuda-command-line-tools-${CUDA_PKG_VERSION} \
cuda-libraries-dev-${CUDA_PKG_VERSION} \
cuda-minimal-build-${CUDA_PKG_VERSION} \
cuda-compat-$CUDA_PKG_VERSION \
libcublas-11-0 \
libcublas-dev-11-0
echo "/usr/local/nvidia/lib" >> /etc/ld.so.conf.d/nvidia.conf
echo "/usr/local/nvidia/lib64" >> /etc/ld.so.conf.d/nvidia.conf
libcublas-${CUDA_PKG_VERSION} \
libcublas-dev-${CUDA_PKG_VERSION}
# add missing symlink
ln -s /usr/local/cuda-11.0 /usr/local/cuda
ln -s /usr/local/cuda-11.0/lib64/stubs/libcuda.so /usr/local/cuda-11.0/lib64/stubs/libcuda.so.1
ln -s /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so /usr/local/cuda-${CUDA_PKG_VERSION}/lib64/stubs/libcuda.so.1
###########################################################################
# NVIDIA OpenCL
@ -131,7 +127,7 @@ From: ubuntu:20.04
###########################################################################
export PATH=$PATH:/opt/rocm/bin:/opt/rocm/profiler/bin:/opt/rocm/opencl/bin/x86_64
git clone -b rocm-4.2.x https://github.com/ROCmSoftwarePlatform/hipCUB.git
git clone -b release/rocm-rel-4.3 https://github.com/ROCmSoftwarePlatform/hipCUB.git
mkdir hipCUB/build
cd hipCUB/build
CXX=hipcc cmake -D BUILD_TEST=off ..
@ -166,7 +162,7 @@ From: ubuntu:20.04
# Plumed
###########################################################################
export PLUMED_PKG_VERSION=2.6.1
export PLUMED_PKG_VERSION=2.7.2
mkdir plumed
cd plumed

View File

@ -99,7 +99,7 @@ From: ubuntu:20.04
# Plumed
###########################################################################
export PLUMED_PKG_VERSION=2.6.1
export PLUMED_PKG_VERSION=2.7.2
mkdir plumed
cd plumed

View File

@ -1,5 +1,5 @@
BootStrap: docker
From: nvidia/cuda:11.0-devel-ubuntu20.04
From: nvidia/cuda:11.4.1-devel-ubuntu20.04
%post
export DEBIAN_FRONTEND=noninteractive
@ -102,7 +102,7 @@ From: nvidia/cuda:11.0-devel-ubuntu20.04
# Plumed
###########################################################################
export PLUMED_PKG_VERSION=2.6.1
export PLUMED_PKG_VERSION=2.7.2
mkdir plumed
cd plumed