Merge remote-tracking branch 'origin/master' into cmake_clean_up

This commit is contained in:
Christoph Junghans
2020-03-25 09:36:36 -06:00
23 changed files with 433 additions and 514 deletions

View File

@ -236,7 +236,15 @@ pkg_depends(USER-LB MPI)
pkg_depends(USER-PHONON KSPACE) pkg_depends(USER-PHONON KSPACE)
pkg_depends(USER-SCAFACOS MPI) pkg_depends(USER-SCAFACOS MPI)
# detect if we may enable OpenMP support by default
set(BUILD_OMP_DEFAULT OFF)
find_package(OpenMP QUIET) find_package(OpenMP QUIET)
if(OpenMP_FOUND)
check_include_file_cxx(omp.h HAVE_OMP_H_INCLUDE)
if(HAVE_OMP_H_INCLUDE)
set(BUILD_OMP_DEFAULT ON)
endif()
endif()
# TODO: this is a temporary workaround until a better solution is found. AK 2019-05-30 # TODO: this is a temporary workaround until a better solution is found. AK 2019-05-30
# GNU GCC 9.x uses settings incompatible with our use of 'default(none)' in OpenMP pragmas # GNU GCC 9.x uses settings incompatible with our use of 'default(none)' in OpenMP pragmas
@ -246,14 +254,14 @@ find_package(OpenMP QUIET)
if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.99.9)) if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.99.9))
option(BUILD_OMP "Build with OpenMP support" OFF) option(BUILD_OMP "Build with OpenMP support" OFF)
else() else()
option(BUILD_OMP "Build with OpenMP support" ${OpenMP_FOUND}) option(BUILD_OMP "Build with OpenMP support" ${BUILD_OMP_DEFAULT})
endif() endif()
if(BUILD_OMP) if(BUILD_OMP)
find_package(OpenMP REQUIRED) find_package(OpenMP REQUIRED)
check_include_file_cxx(omp.h HAVE_OMP_H_INCLUDE) check_include_file_cxx(omp.h HAVE_OMP_H_INCLUDE)
if(NOT HAVE_OMP_H_INCLUDE) if(NOT HAVE_OMP_H_INCLUDE)
message(FATAL_ERROR "Cannot find required 'omp.h' header file") message(FATAL_ERROR "Cannot find the 'omp.h' header file required for full OpenMP support")
endif() endif()
target_link_libraries(lammps PRIVATE OpenMP::OpenMP_CXX) target_link_libraries(lammps PRIVATE OpenMP::OpenMP_CXX)
endif() endif()

View File

@ -31,7 +31,7 @@ if(PKG_USER-INTEL)
endif() endif()
endif() endif()
if(INTEL_LRT_MODE STREQUAL "C++11") if(INTEL_LRT_MODE STREQUAL "C++11")
target_compile_definitions(lammps PRIVATE -DLMP_INTEL_USERLRT -DLMP_INTEL_LRT11) target_compile_definitions(lammps PRIVATE -DLMP_INTEL_USELRT -DLMP_INTEL_LRT11)
endif() endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")

View File

@ -12,6 +12,10 @@ via apt-get and all files are accessible in both the Windows Explorer and your
Linux shell (bash). This avoids switching to a different operating system or Linux shell (bash). This avoids switching to a different operating system or
installing a virtual machine. Everything runs on Windows. installing a virtual machine. Everything runs on Windows.
.. seealso::
You can find more detailed information at the `Windows Subsystem for Linux Installation Guide for Windows 10 <https://docs.microsoft.com/en-us/windows/wsl/install-win10>`_.
Installing Bash on Windows Installing Bash on Windows
-------------------------- --------------------------
@ -103,7 +107,7 @@ needed for various LAMMPS features:
.. code-block:: bash .. code-block:: bash
sudo apt install -y build-essential ccache gfortran openmpi-bin libopenmpi-dev libfftw3-dev libjpeg-dev libpng12-dev python-dev python-virtualenv libblas-dev liblapack-dev libhdf5-serial-dev hdf5-tools sudo apt install -y build-essential ccache gfortran openmpi-bin libopenmpi-dev libfftw3-dev libjpeg-dev libpng-dev python-dev python-virtualenv libblas-dev liblapack-dev libhdf5-serial-dev hdf5-tools
Files in Ubuntu on Windows Files in Ubuntu on Windows
^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -32,7 +32,7 @@ Examples
bond_coeff * 2.0 0.25 0.7564 bond_coeff * 2.0 0.25 0.7564
bond_style oxrna2/fene bond_style oxrna2/fene
bond_coeff \* 2.0 0.25 0.76107 bond_coeff * 2.0 0.25 0.76107
Description Description
""""""""""" """""""""""

View File

@ -132,9 +132,9 @@ and Te. If your LAMMPS simulation has 4 atoms types and you want the
1st 3 to be Cd, and the 4th to be Te, you would use the following 1st 3 to be Cd, and the 4th to be Te, you would use the following
pair_coeff command: pair_coeff command:
.. parsed-literal:: .. code-block:: LAMMPS
pair_coeff \* \* CdTe Cd Cd Cd Te pair_coeff * * CdTe Cd Cd Cd Te
The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types. The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types.
The first three Cd arguments map LAMMPS atom types 1,2,3 to the Cd The first three Cd arguments map LAMMPS atom types 1,2,3 to the Cd

View File

@ -60,18 +60,18 @@ Examples
.. code-block:: LAMMPS .. code-block:: LAMMPS
pair_style lj/class2 10.0 pair_style lj/class2 10.0
pair_coeff \* \* 100.0 2.5 pair_coeff * * 100.0 2.5
pair_coeff 1 2\* 100.0 2.5 9.0 pair_coeff 1 2* 100.0 2.5 9.0
pair_style lj/class2/coul/cut 10.0 pair_style lj/class2/coul/cut 10.0
pair_style lj/class2/coul/cut 10.0 8.0 pair_style lj/class2/coul/cut 10.0 8.0
pair_coeff \* \* 100.0 3.0 pair_coeff * * 100.0 3.0
pair_coeff 1 1 100.0 3.5 9.0 pair_coeff 1 1 100.0 3.5 9.0
pair_coeff 1 1 100.0 3.5 9.0 9.0 pair_coeff 1 1 100.0 3.5 9.0 9.0
pair_style lj/class2/coul/long 10.0 pair_style lj/class2/coul/long 10.0
pair_style lj/class2/coul/long 10.0 8.0 pair_style lj/class2/coul/long 10.0 8.0
pair_coeff \* \* 100.0 3.0 pair_coeff * * 100.0 3.0
pair_coeff 1 1 100.0 3.5 9.0 pair_coeff 1 1 100.0 3.5 9.0
Description Description

View File

@ -19,11 +19,11 @@ Examples
.. code-block:: LAMMPS .. code-block:: LAMMPS
pair_coeff 1 2 1.0 1.0 2.5 pair_coeff 1 2 1.0 1.0 2.5
pair_coeff 2 \* 1.0 1.0 pair_coeff 2 * 1.0 1.0
pair_coeff 3\* 1\*2 1.0 1.0 2.5 pair_coeff 3* 1*2 1.0 1.0 2.5
pair_coeff \* \* 1.0 1.0 pair_coeff * * 1.0 1.0
pair_coeff \* \* nialhjea 1 1 2 pair_coeff * * nialhjea 1 1 2
pair_coeff \* 3 morse.table ENTRY1 pair_coeff * 3 morse.table ENTRY1
pair_coeff 1 2 lj/cut 1.0 1.0 2.5 (for pair_style hybrid) pair_coeff 1 2 lj/cut 1.0 1.0 2.5 (for pair_style hybrid)
Description Description
@ -55,7 +55,7 @@ pairs, then overwrite the coeffs for just the I,J = 2,3 pair:
.. code-block:: LAMMPS .. code-block:: LAMMPS
pair_coeff \* \* 1.0 1.0 2.5 pair_coeff * * 1.0 1.0 2.5
pair_coeff 2 3 2.0 1.0 1.12 pair_coeff 2 3 2.0 1.0 1.12
A line in a data file that specifies pair coefficients uses the exact A line in a data file that specifies pair coefficients uses the exact

View File

@ -31,7 +31,7 @@ Examples
.. code-block:: LAMMPS .. code-block:: LAMMPS
pair_style cosine/squared 3.0 pair_style cosine/squared 3.0
pair_coeff \* \* 1.0 1.3 pair_coeff * * 1.0 1.3
pair_coeff 1 3 1.0 1.3 2.0 pair_coeff 1 3 1.0 1.3 2.0
pair_coeff 1 3 1.0 1.3 wca pair_coeff 1 3 1.0 1.3 wca
pair_coeff 1 3 1.0 1.3 2.0 wca pair_coeff 1 3 1.0 1.3 2.0 wca

View File

@ -75,7 +75,9 @@ If your LAMMPS simulation has 3 atoms types and they are all to be
treated with this potential, you would use the following pair_coeff treated with this potential, you would use the following pair_coeff
command: command:
pair_coeff \* \* Ti.meam.sw.spline Ti Ti Ti .. code-block:: LAMMPS
pair_coeff * * Ti.meam.sw.spline Ti Ti Ti
The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types. The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types.
The three Ti arguments map LAMMPS atom types 1,2,3 to the Ti element The three Ti arguments map LAMMPS atom types 1,2,3 to the Ti element

View File

@ -64,7 +64,9 @@ NULL values are placeholders for atom types that will be used with
other potentials. An example of a pair_coeff command for use with the other potentials. An example of a pair_coeff command for use with the
*hybrid* pair style is: *hybrid* pair style is:
pair_coeff \* \* nb3b/harmonic MgOH.nb3b.harmonic Mg O H .. code-block:: LAMMPS
pair_coeff * * nb3b/harmonic MgOH.nb3b.harmonic Mg O H
Three-body non-bonded harmonic files in the *potentials* directory of Three-body non-bonded harmonic files in the *potentials* directory of
the LAMMPS distribution have a ".nb3b.harmonic" suffix. Lines that the LAMMPS distribution have a ".nb3b.harmonic" suffix. Lines that

View File

@ -180,9 +180,9 @@ functions for Si-C tersoff potential. If your LAMMPS simulation has 4
atom types and you want the 1st 3 to be Si, and the 4th to be C, you atom types and you want the 1st 3 to be Si, and the 4th to be C, you
would use the following pair_coeff command: would use the following pair_coeff command:
.. parsed-literal:: .. code-block:: LAMMPS
pair_coeff \* \* SiC_tersoff.poly Si Si Si C pair_coeff * * SiC_tersoff.poly Si Si Si C
The 1st 2 arguments must be \* \* so as to span all LAMMPS atom The 1st 2 arguments must be \* \* so as to span all LAMMPS atom
types. The first three Si arguments map LAMMPS atom types 1,2,3 to the types. The first three Si arguments map LAMMPS atom types 1,2,3 to the

View File

@ -113,8 +113,8 @@ which the parameters epsilon and sigma are both 1.0:
class LJCutMelt(LAMMPSPairPotential): class LJCutMelt(LAMMPSPairPotential):
def __init__(self): def __init__(self):
super(LJCutMelt,self).__init__() super(LJCutMelt,self).__init__()
# set coeffs: 48\*eps\*sig\*\*12, 24\*eps\*sig\*\*6, # set coeffs: 48*eps*sig**12, 24*eps*sig**6,
# 4\*eps\*sig\*\*12, 4\*eps\*sig\*\*6 # 4*eps*sig**12, 4*eps*sig**6
self.units = 'lj' self.units = 'lj'
self.coeff = {'lj' : {'lj' : (48.0,24.0,4.0,4.0)}} self.coeff = {'lj' : {'lj' : (48.0,24.0,4.0,4.0)}}
@ -137,18 +137,18 @@ the *LJCutMelt* example, here are the two functions:
def compute_force(self,rsq,itype,jtype): def compute_force(self,rsq,itype,jtype):
coeff = self.coeff[self.pmap[itype]][self.pmap[jtype]] coeff = self.coeff[self.pmap[itype]][self.pmap[jtype]]
r2inv = 1.0/rsq r2inv = 1.0/rsq
r6inv = r2inv\*r2inv\*r2inv r6inv = r2inv*r2inv*r2inv
lj1 = coeff[0] lj1 = coeff[0]
lj2 = coeff[1] lj2 = coeff[1]
return (r6inv \* (lj1\*r6inv - lj2))\*r2inv return (r6inv * (lj1*r6inv - lj2))*r2inv
def compute_energy(self,rsq,itype,jtype): def compute_energy(self,rsq,itype,jtype):
coeff = self.coeff[self.pmap[itype]][self.pmap[jtype]] coeff = self.coeff[self.pmap[itype]][self.pmap[jtype]]
r2inv = 1.0/rsq r2inv = 1.0/rsq
r6inv = r2inv\*r2inv\*r2inv r6inv = r2inv*r2inv*r2inv
lj3 = coeff[2] lj3 = coeff[2]
lj4 = coeff[3] lj4 = coeff[3]
return (r6inv \* (lj3\*r6inv - lj4)) return (r6inv * (lj3*r6inv - lj4))
.. note:: .. note::

View File

@ -18,7 +18,7 @@ Examples
.. code-block:: LAMMPS .. code-block:: LAMMPS
pair_style spin/magelec 4.5 pair_style spin/magelec 4.5
pair_coeff \* \* magelec 4.5 0.00109 1.0 1.0 1.0 pair_coeff * * magelec 4.5 0.00109 1.0 1.0 1.0
Description Description
""""""""""" """""""""""

View File

@ -37,15 +37,13 @@ struct TagPairSNAPBeta{};
struct TagPairSNAPComputeNeigh{}; struct TagPairSNAPComputeNeigh{};
struct TagPairSNAPPreUi{}; struct TagPairSNAPPreUi{};
struct TagPairSNAPComputeUi{}; struct TagPairSNAPComputeUi{};
struct TagPairSNAPComputeUiTot{}; // accumulate ulist into ulisttot separately
struct TagPairSNAPComputeUiCPU{}; struct TagPairSNAPComputeUiCPU{};
struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeZi{};
struct TagPairSNAPComputeBi{}; struct TagPairSNAPComputeBi{};
struct TagPairSNAPZeroYi{}; struct TagPairSNAPZeroYi{};
struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeYi{};
struct TagPairSNAPComputeDuidrj{}; struct TagPairSNAPComputeFusedDeidrj{};
struct TagPairSNAPComputeDuidrjCPU{}; struct TagPairSNAPComputeDuidrjCPU{};
struct TagPairSNAPComputeDeidrj{};
struct TagPairSNAPComputeDeidrjCPU{}; struct TagPairSNAPComputeDeidrjCPU{};
template<class DeviceType> template<class DeviceType>
@ -83,9 +81,6 @@ public:
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const; void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const; void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const;
@ -102,14 +97,11 @@ public:
void operator() (TagPairSNAPComputeYi,const int& ii) const; void operator() (TagPairSNAPComputeYi,const int& ii) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const; void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const; void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU>::member_type& team) const;
KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const;
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const; void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const;

View File

@ -30,7 +30,6 @@
#include "kokkos.h" #include "kokkos.h"
#include "sna.h" #include "sna.h"
#define MAXLINE 1024 #define MAXLINE 1024
#define MAXWORD 3 #define MAXWORD 3
@ -255,26 +254,19 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
// scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values
// 2 is for double buffer // 2 is for double buffer
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
const int tile_size = (twojmax+1)*(twojmax+1);
typedef Kokkos::View< SNAcomplex*, typedef Kokkos::View< SNAcomplex*,
Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::DefaultExecutionSpace::scratch_memory_space,
Kokkos::MemoryTraits<Kokkos::Unmanaged> > Kokkos::MemoryTraits<Kokkos::Unmanaged> >
ScratchViewType; ScratchViewType;
int scratch_size = ScratchViewType::shmem_size( 2 * team_size * (twojmax+1)*(twojmax+1)); int scratch_size = ScratchViewType::shmem_size( 2 * team_size * tile_size );
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUi> policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size )); policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
Kokkos::parallel_for("ComputeUi",policy_ui,*this); Kokkos::parallel_for("ComputeUi",policy_ui,*this);
// ComputeUitot
vector_length = 1;
team_size = 128;
team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::team_size_max(*this);
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot> policy_ui_tot(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length);
Kokkos::parallel_for("ComputeUiTot",policy_ui_tot,*this);
} }
@ -316,7 +308,7 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
typename Kokkos::RangePolicy<DeviceType, TagPairSNAPComputeYi> policy_yi(0,chunk_size*idxz_max); typename Kokkos::RangePolicy<DeviceType, TagPairSNAPComputeYi> policy_yi(0,chunk_size*idxz_max);
Kokkos::parallel_for("ComputeYi",policy_yi,*this); Kokkos::parallel_for("ComputeYi",policy_yi,*this);
//ComputeDuidrj //ComputeDuidrj and Deidrj
if (lmp->kokkos->ngpus == 0) { // CPU if (lmp->kokkos->ngpus == 0) { // CPU
int vector_length = 1; int vector_length = 1;
int team_size = 1; int team_size = 1;
@ -324,53 +316,37 @@ void PairSNAPKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrjCPU> policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
snaKK.set_dir(-1); // technically doesn't do anything snaKK.set_dir(-1); // technically doesn't do anything
Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this);
} else { // GPU, utilize scratch memory and splitting over dimensions
int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::team_size_max(*this); typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
} else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei
int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::team_size_max(*this);
int vector_length = 32; int vector_length = 32;
int team_size = 2; // need to cap b/c of shared memory reqs int team_size = 2; // need to cap b/c of shared memory reqs
if (team_size*vector_length > team_size_max) if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length; team_size = team_size_max/vector_length;
// scratch size: 2 * 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry
// 2 is for double buffer // 2 is for double buffer
const int tile_size = (twojmax+1)*(twojmax/2+1);
typedef Kokkos::View< SNAcomplex*, typedef Kokkos::View< SNAcomplex*,
Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::DefaultExecutionSpace::scratch_memory_space,
Kokkos::MemoryTraits<Kokkos::Unmanaged> > Kokkos::MemoryTraits<Kokkos::Unmanaged> >
ScratchViewType; ScratchViewType;
int scratch_size = ScratchViewType::shmem_size( 4 * team_size * tile_size);
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj> policy_fused_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
policy_fused_deidrj = policy_fused_deidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
int scratch_size = ScratchViewType::shmem_size( 4 * team_size * (twojmax+1)*(twojmax+1));
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj> policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
policy_duidrj = policy_duidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size ));
// Need to call three times, once for each direction
for (int k = 0; k < 3; k++) { for (int k = 0; k < 3; k++) {
snaKK.set_dir(k); snaKK.set_dir(k);
Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this); Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this);
} }
} }
//ComputeDeidrj
if (lmp->kokkos->ngpus == 0) { // CPU
int vector_length = 1;
int team_size = 1;
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU> policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this);
} else { // GPU, different loop strategy internally
int team_size_max = Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::team_size_max(*this);
int vector_length = 32; // coalescing disaster right now, will fix later
int team_size = 8;
if (team_size*vector_length > team_size_max)
team_size = team_size_max/vector_length;
typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj> policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length);
Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this);
}
//ComputeForce //ComputeForce
if (eflag) { if (eflag) {
if (neighflag == HALF) { if (neighflag == HALF) {
@ -642,25 +618,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUi,const typename
my_sna.compute_ui(team,ii,jj); my_sna.compute_ui(team,ii,jj);
} }
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiTot>::member_type& team) const {
SNAKokkos<DeviceType> my_sna = snaKK;
// Extract the quantum number
const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size()));
if (idx >= my_sna.idxu_max) return;
// Extract the atomic index
const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size());
if (ii >= chunk_size) return;
// Extract the number of neighbors
const int ninside = d_ninside(ii);
my_sna.compute_uitot(team,idx,ii,ninside);
}
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeUiCPU>::member_type& team) const {
@ -718,7 +675,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeBi,const typename
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDuidrj>::member_type& team) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeFusedDeidrj>::member_type& team) const {
SNAKokkos<DeviceType> my_sna = snaKK; SNAKokkos<DeviceType> my_sna = snaKK;
// Extract the atom number // Extract the atom number
@ -730,7 +687,7 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrj,const type
const int ninside = d_ninside(ii); const int ninside = d_ninside(ii);
if (jj >= ninside) return; if (jj >= ninside) return;
my_sna.compute_duidrj(team,ii,jj); my_sna.compute_fused_deidrj(team,ii,jj);
} }
template<class DeviceType> template<class DeviceType>
@ -750,24 +707,6 @@ void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDuidrjCPU,const t
my_sna.compute_duidrj_cpu(team,ii,jj); my_sna.compute_duidrj_cpu(team,ii,jj);
} }
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrj>::member_type& team) const {
SNAKokkos<DeviceType> my_sna = snaKK;
// Extract the atom number
int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size()));
if (ii >= chunk_size) return;
// Extract the neighbor number
const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
const int ninside = d_ninside(ii);
if (jj >= ninside) return;
my_sna.compute_deidrj(team,ii,jj);
}
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const { void PairSNAPKokkos<DeviceType>::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy<DeviceType, TagPairSNAPComputeDeidrjCPU>::member_type& team) const {

View File

@ -135,14 +135,10 @@ inline
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&); // ForceSNAP void pre_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team,const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP void compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP void compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_ui_orig(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_uitot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_zi(const int&); // ForceSNAP void compute_zi(const int&); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void zero_yi(const int&,const int&); // ForceSNAP void zero_yi(const int&,const int&); // ForceSNAP
@ -155,12 +151,10 @@ inline
// functions for derivatives // functions for derivatives
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP void compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int, const int); //ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP void compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); //ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION
void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP void compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int); // ForceSNAP
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
double compute_sfac(double, double); // add_uarraytot, compute_duarray double compute_sfac(double, double); // add_uarraytot, compute_duarray
@ -251,10 +245,6 @@ inline
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, double, double, double); // compute_ui void add_uarraytot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, double, double, double); // compute_ui
KOKKOS_INLINE_FUNCTION
void compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
double, double, double,
double, double); // compute_ui
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, void compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
double, double, double, double, double, double,
@ -267,12 +257,8 @@ inline
inline inline
int compute_ncoeff(); // SNAKokkos() int compute_ncoeff(); // SNAKokkos()
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
double, double, double, // compute_duidrj
double, double, double, double, double);
KOKKOS_INLINE_FUNCTION
void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int, void compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int, int,
double, double, double, // compute_duidrj double, double, double, // compute_duidrj_cpu
double, double, double, double, double); double, double, double, double, double);
// Sets the style for the switching function // Sets the style for the switching function

View File

@ -19,6 +19,7 @@
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
#include <cstdlib> #include <cstdlib>
#include <type_traits>
namespace LAMMPS_NS { namespace LAMMPS_NS {
@ -231,11 +232,22 @@ void SNAKokkos<DeviceType>::grow_rij(int newnatom, int newnmax)
zlist = t_sna_2c_ll("sna:zlist",idxz_max,natom); zlist = t_sna_2c_ll("sna:zlist",idxz_max,natom);
//ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); //ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max);
#ifdef KOKKOS_ENABLE_CUDA
if (std::is_same<DeviceType,Kokkos::Cuda>::value) {
// dummy allocation
ulist = t_sna_3c_ll("sna:ulist",1,1,1);
dulist = t_sna_4c_ll("sna:dulist",1,1,1);
} else {
#endif
ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax); ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax);
dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax);
#ifdef KOKKOS_ENABLE_CUDA
}
#endif
//ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max); //ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max);
ylist = t_sna_2c_ll("sna:ylist",idxu_max,natom); ylist = t_sna_2c_ll("sna:ylist",idxu_max,natom);
//dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max);
dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax); dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax);
} }
@ -269,14 +281,14 @@ void SNAKokkos<DeviceType>::pre_ui(const typename Kokkos::TeamPolicy<DeviceType>
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
compute Ui by summing over bispectrum components compute Ui by computing Wigner U-functions for one neighbor and
accumulating to the total. GPU only.
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom, const int jnbor)
{ {
double rsq, r, x, y, z, z0, theta0;
// utot(j,ma,mb) = 0 for all j,ma,mb // utot(j,ma,mb) = 0 for all j,ma,mb
// utot(j,ma,ma) = 1 for all j,ma // utot(j,ma,ma) = 1 for all j,ma
@ -284,22 +296,143 @@ void SNAKokkos<DeviceType>::compute_ui(const typename Kokkos::TeamPolicy<DeviceT
// compute r0 = (x,y,z,z0) // compute r0 = (x,y,z,z0)
// utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb
x = rij(iatom,jnbor,0); // get shared memory offset
y = rij(iatom,jnbor,1); const int max_m_tile = (twojmax+1)*(twojmax+1);
z = rij(iatom,jnbor,2); const int team_rank = team.team_rank();
rsq = x * x + y * y + z * z; const int scratch_shift = team_rank * max_m_tile;
r = sqrt(rsq);
theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); // double buffer
SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
const double x = rij(iatom,jnbor,0);
const double y = rij(iatom,jnbor,1);
const double z = rij(iatom,jnbor,2);
const double wj_local = wj(iatom, jnbor);
const double rcut = rcutij(iatom, jnbor);
const double rsq = x * x + y * y + z * z;
const double r = sqrt(rsq);
const double theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
// theta0 = (r - rmin0) * rscale0; // theta0 = (r - rmin0) * rscale0;
z0 = r / tan(theta0); const double cs = cos(theta0);
const double sn = sin(theta0);
const double z0 = r * cs / sn; // r / tan(theta0)
compute_uarray(team, iatom, jnbor, x, y, z, z0, r); // Compute cutoff function
const double sfac = compute_sfac(r, rcut) * wj_local;
// if we're on the GPU, accumulating into uarraytot is done in a separate kernel. // compute Cayley-Klein parameters for unit quaternion,
// if we're not, it's more efficient to include it in compute_uarray. // pack into complex number
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
const SNAcomplex a = { r0inv * z0, -r0inv * z };
const SNAcomplex b = { r0inv * y, -r0inv * x };
// VMK Section 4.8.2
// All writes go to global memory and shared memory
// so we can avoid all global memory reads
Kokkos::single(Kokkos::PerThread(team), [=]() {
//ulist(0,iatom,jnbor) = { 1.0, 0.0 };
buf1[0] = {1.,0.};
Kokkos::atomic_add(&(ulisttot(0,iatom).re), sfac);
});
for (int j = 1; j <= twojmax; j++) {
const int jju = idxu_block[j];
const int jjup = idxu_block[j-1];
// fill in left side of matrix layer from previous layer
// Flatten loop over ma, mb, need to figure out total
// number of iterations
// for (int ma = 0; ma <= j; ma++)
const int n_ma = j+1;
// for (int mb = 0; 2*mb <= j; mb++)
const int n_mb = j/2+1;
// the last (j / 2) can be avoided due to symmetry
const int total_iters = n_ma * n_mb - (j % 2 == 0 ? (j / 2) : 0);
//for (int m = 0; m < total_iters; m++) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
[&] (const int m) {
// ma fast, mb slow
int ma = m % n_ma;
int mb = m / n_ma;
// index into global memory array
const int jju_index = jju+m;
//const int jjup_index = jjup+mb*j+ma;
// index into shared memory buffer for this level
const int jju_shared_idx = m;
// index into shared memory buffer for next level
const int jjup_shared_idx = jju_shared_idx - mb;
SNAcomplex u_accum = {0., 0.};
// VMK recursion relation: grab contribution which is multiplied by b*
const double rootpq2 = -rootpqarray(ma, j - mb);
const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
//const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.);
caconjxpy(b, u_up2, u_accum);
// VMK recursion relation: grab contribution which is multiplied by a*
const double rootpq1 = rootpqarray(j - ma, j - mb);
const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
//const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist(jjup_index,iatom,jnbor):SNAcomplex(0.,0.);
caconjxpy(a, u_up1, u_accum);
//ulist(jju_index,iatom,jnbor) = u_accum;
// back up into shared memory for next iter
buf2[jju_shared_idx] = u_accum;
Kokkos::atomic_add(&(ulisttot(jju_index,iatom).re), sfac * u_accum.re);
Kokkos::atomic_add(&(ulisttot(jju_index,iatom).im), sfac * u_accum.im);
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj(u[ma,mb])
// if j is even (-> physical j integer), last element maps to self, skip
//if (!(m == total_iters - 1 && j % 2 == 0)) {
if (m < total_iters - 1 || j % 2 == 1) {
const int sign_factor = (((ma+mb)%2==0)?1:-1);
const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1);
if (sign_factor == 1) {
u_accum.im = -u_accum.im;
} else {
u_accum.re = -u_accum.re;
}
//ulist(jjup_flip,iatom,jnbor) = u_accum;
buf2[jju_shared_flip] = u_accum;
Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).re), sfac * u_accum.re);
Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).im), sfac * u_accum.im);
}
});
// In CUDA backend,
// ThreadVectorRange has a __syncwarp (appropriately masked for
// vector lengths < 32) implicit at the end
// swap double buffers
auto tmp = buf1; buf1 = buf2; buf2 = tmp;
}
} }
/* ----------------------------------------------------------------------
compute Ui by summing over bispectrum components. CPU only.
------------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
@ -327,40 +460,8 @@ void SNAKokkos<DeviceType>::compute_ui_cpu(const typename Kokkos::TeamPolicy<Dev
} }
/* ----------------------------------------------------------------------
compute UiTot by summing over neighbors
------------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_uitot(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int idx, int iatom, int ninside)
{
// fuse initialize in, avoid this load?
SNAcomplex utot = ulisttot(idx, iatom);
for (int jnbor = 0; jnbor < ninside; jnbor++) {
const auto x = rij(iatom,jnbor,0);
const auto y = rij(iatom,jnbor,1);
const auto z = rij(iatom,jnbor,2);
const auto rsq = x * x + y * y + z * z;
const auto r = sqrt(rsq);
const double wj_local = wj(iatom, jnbor);
const double rcut = rcutij(iatom, jnbor);
const double sfac = compute_sfac(r, rcut) * wj_local;
auto ulist_local = ulist(idx, iatom, jnbor);
utot.re += sfac * ulist_local.re;
utot.im += sfac * ulist_local.im;
}
ulisttot(idx, iatom) = utot;
}
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
compute Zi by summing over products of Ui compute Zi by summing over products of Ui
not updated yet
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
@ -509,72 +610,203 @@ void SNAKokkos<DeviceType>::compute_yi(int iter,
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
compute dEidRj Fused calculation of the derivative of Ui w.r.t. atom j
and of dEidRj. GPU only.
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) void SNAKokkos<DeviceType>::compute_fused_deidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, const int iatom, const int jnbor)
{ {
t_scalar3<double> final_sum; // get shared memory offset
const int max_m_tile = (twojmax+1)*(twojmax/2+1);
const int team_rank = team.team_rank();
const int scratch_shift = team_rank * max_m_tile;
// Like in ComputeUi/ComputeDuidrj, regular loop over j. // double buffer for ulist
for (int j = 0; j <= twojmax; j++) { SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
int jju = idxu_block(j); SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
// Flatten loop over ma, mb, reduce w/in // double buffer for dulist
SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift;
const double x = rij(iatom,jnbor,0);
const double y = rij(iatom,jnbor,1);
const double z = rij(iatom,jnbor,2);
const double rsq = x * x + y * y + z * z;
const double r = sqrt(rsq);
const double rcut = rcutij(iatom, jnbor);
const double rscale0 = rfac0 * MY_PI / (rcut - rmin0);
const double theta0 = (r - rmin0) * rscale0;
const double cs = cos(theta0);
const double sn = sin(theta0);
const double z0 = r * cs / sn;
const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
const double wj_local = wj(iatom, jnbor);
const double sfac = wj_local * compute_sfac(r, rcut);
const double dsfac = wj_local * compute_dsfac(r, rcut);
const double rinv = 1.0 / r;
// extract a single unit vector
const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv);
// Compute Cayley-Klein parameters for unit quaternion
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
const SNAcomplex a = { r0inv * z0, -r0inv * z };
const SNAcomplex b = { r0inv * y, -r0inv * x };
const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
const double dr0inv = dr0invdr * u;
const double dz0 = dz0dr * u;
const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv,
- z * dr0inv + (dir == 2 ? - r0inv : 0.) };
const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.),
-x * dr0inv + (dir==0?-r0inv:0.) };
// Accumulate the full contribution to dedr on the fly
const double du_prod = dsfac * u; // chain rule
const SNAcomplex y_local = ylist(0, iatom);
// Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0
double dedr_full_sum = 0.5 * du_prod * y_local.re;
// single has a warp barrier at the end
Kokkos::single(Kokkos::PerThread(team), [=]() {
//dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here
ulist_buf1[0] = {1., 0.};
dulist_buf1[0] = {0., 0.};
});
for (int j = 1; j <= twojmax; j++) {
int jju = idxu_block[j];
int jjup = idxu_block[j-1];
// flatten the loop over ma,mb
// for (int ma = 0; ma <= j; ma++)
const int n_ma = j+1; const int n_ma = j+1;
// for (int mb = 0; 2*mb <= j; mb++) // for (int mb = 0; 2*mb <= j; mb++)
const int n_mb = j/2+1; const int n_mb = j/2+1;
const int total_iters = n_ma * n_mb; const int total_iters = n_ma * n_mb;
t_scalar3<double> sum; double dedr_sum = 0.; // j-local sum
//for (int m = 0; m < total_iters; m++) { //for (int m = 0; m < total_iters; m++) {
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters), Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters),
[&] (const int m, t_scalar3<double>& sum_tmp) { [&] (const int m, double& sum_tmp) {
// ma fast, mb slow // ma fast, mb slow
int ma = m % n_ma; int ma = m % n_ma;
int mb = m / n_ma; int mb = m / n_ma;
// get index const int jju_index = jju+m;
const int jju_index = jju+mb+mb*j+ma;
// get ylist, rescale last element by 0.5
SNAcomplex y_local = ylist(jju_index,iatom);
const SNAcomplex du_x = dulist(jju_index,iatom,jnbor,0);
const SNAcomplex du_y = dulist(jju_index,iatom,jnbor,1);
const SNAcomplex du_z = dulist(jju_index,iatom,jnbor,2);
// Load y_local, apply the symmetry scaling factor
// The "secret" of the shared memory optimization is it eliminates
// all global memory reads to duidrj in lieu of caching values in
// shared memory and otherwise always writing, making the kernel
// ultimately compute bound. We take advantage of that by adding
// some reads back in.
auto y_local = ylist(jju_index,iatom);
if (j % 2 == 0 && 2*mb == j) { if (j % 2 == 0 && 2*mb == j) {
if (ma == mb) { y_local = 0.5*y_local; } if (ma == mb) { y_local = 0.5*y_local; }
else if (ma > mb) { y_local = { 0., 0. }; } else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright
// else the ma < mb gets "double counted", cancelling the 0.5. // else the ma < mb gets "double counted", cancelling the 0.5.
} }
sum_tmp.x += du_x.re * y_local.re + du_x.im * y_local.im; // index into shared memory
sum_tmp.y += du_y.re * y_local.re + du_y.im * y_local.im; const int jju_shared_idx = m;
sum_tmp.z += du_z.re * y_local.re + du_z.im * y_local.im; const int jjup_shared_idx = jju_shared_idx - mb;
}, sum); // end loop over flattened ma,mb // Need to compute and accumulate both u and du (mayhaps, we could probably
// balance some read and compute by reading u each time).
SNAcomplex u_accum = { 0., 0. };
SNAcomplex du_accum = { 0., 0. };
final_sum.x += sum.x; const double rootpq2 = -rootpqarray(ma, j - mb);
final_sum.y += sum.y; const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
final_sum.z += sum.z; caconjxpy(b, u_up2, u_accum);
const double rootpq1 = rootpqarray(j - ma, j - mb);
const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
caconjxpy(a, u_up1, u_accum);
// Next, spin up du_accum
const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.);
caconjxpy(da, u_up1, du_accum);
caconjxpy(a, du_up1, du_accum);
const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.);
caconjxpy(db, u_up2, du_accum);
caconjxpy(b, du_up2, du_accum);
// No need to save u_accum to global memory
// Cache u_accum, du_accum to scratch memory.
ulist_buf2[jju_shared_idx] = u_accum;
dulist_buf2[jju_shared_idx] = du_accum;
// Directly accumulate deidrj into sum_tmp
//dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum);
sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im;
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj(u[ma][mb])
if (j%2==1 && mb+1==n_mb) {
int sign_factor = (((ma+mb)%2==0)?1:-1);
//const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist
const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1);
if (sign_factor == 1) {
u_accum.im = -u_accum.im;
du_accum.im = -du_accum.im;
} else {
u_accum.re = -u_accum.re;
du_accum.re = -du_accum.re;
} }
Kokkos::single(Kokkos::PerThread(team), [&] () { // We don't need the second half of the tile for the deidrj accumulation.
dedr(iatom,jnbor,0) = final_sum.x*2.0; // That's taken care of by the symmetry factor above.
dedr(iatom,jnbor,1) = final_sum.y*2.0; //dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
dedr(iatom,jnbor,2) = final_sum.z*2.0;
});
// We do need it for ortho polynomial generation, though
ulist_buf2[jju_shared_flip] = u_accum;
dulist_buf2[jju_shared_flip] = du_accum;
}
}, dedr_sum);
// swap buffers
auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp;
tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp;
// Accumulate dedr. This "should" be in a single, but
// a Kokkos::single call implies a warp sync, and we may
// as well avoid that. This does no harm as long as the
// final assignment is in a single block.
//Kokkos::single(Kokkos::PerThread(team), [=]() {
dedr_full_sum += dedr_sum;
//});
}
// Store the accumulated dedr.
Kokkos::single(Kokkos::PerThread(team), [&] () {
dedr(iatom,jnbor,dir) = dedr_full_sum*2.0;
});
} }
/* ----------------------------------------------------------------------
compute dEidRj, CPU path only.
------------------------------------------------------------------------- */
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) void SNAKokkos<DeviceType>::compute_deidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
@ -708,28 +940,6 @@ void SNAKokkos<DeviceType>::compute_bi(const typename Kokkos::TeamPolicy<DeviceT
calculate derivative of Ui w.r.t. atom j calculate derivative of Ui w.r.t. atom j
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_duidrj(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
{
double rsq, r, x, y, z, z0, theta0, cs, sn;
double dz0dr;
x = rij(iatom,jnbor,0);
y = rij(iatom,jnbor,1);
z = rij(iatom,jnbor,2);
rsq = x * x + y * y + z * z;
r = sqrt(rsq);
double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0);
theta0 = (r - rmin0) * rscale0;
cs = cos(theta0);
sn = sin(theta0);
z0 = r * cs / sn;
dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq;
compute_duarray(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor));
}
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor) void SNAKokkos<DeviceType>::compute_duidrj_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor)
@ -774,119 +984,6 @@ void SNAKokkos<DeviceType>::add_uarraytot(const typename Kokkos::TeamPolicy<Devi
compute Wigner U-functions for one neighbor compute Wigner U-functions for one neighbor
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_uarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
double x, double y, double z,
double z0, double r)
{
// define size of scratch memory buffer
const int max_m_tile = (twojmax+1)*(twojmax+1);
const int team_rank = team.team_rank();
// get scratch memory double buffer
SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
// compute Cayley-Klein parameters for unit quaternion,
// pack into complex number
double r0inv = 1.0 / sqrt(r * r + z0 * z0);
SNAcomplex a = { r0inv * z0, -r0inv * z };
SNAcomplex b = { r0inv * y, -r0inv * x };
// VMK Section 4.8.2
// All writes go to global memory and shared memory
// so we can avoid all global memory reads
Kokkos::single(Kokkos::PerThread(team), [=]() {
ulist(0,iatom,jnbor) = { 1.0, 0.0 };
buf1[max_m_tile*team_rank] = {1.,0.};
});
for (int j = 1; j <= twojmax; j++) {
const int jju = idxu_block[j];
int jjup = idxu_block[j-1];
// fill in left side of matrix layer from previous layer
// Flatten loop over ma, mb, need to figure out total
// number of iterations
// for (int ma = 0; ma <= j; ma++)
const int n_ma = j+1;
// for (int mb = 0; 2*mb <= j; mb++)
const int n_mb = j/2+1;
const int total_iters = n_ma * n_mb;
//for (int m = 0; m < total_iters; m++) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
[&] (const int m) {
// ma fast, mb slow
int ma = m % n_ma;
int mb = m / n_ma;
// index into global memory array
const int jju_index = jju+mb+mb*j+ma;
// index into shared memory buffer for previous level
const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma;
// index into shared memory buffer for next level
const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma;
SNAcomplex u_accum = {0., 0.};
// VMK recursion relation: grab contribution which is multiplied by a*
const double rootpq1 = rootpqarray(j - ma, j - mb);
const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
caconjxpy(a, u_up1, u_accum);
// VMK recursion relation: grab contribution which is multiplied by b*
const double rootpq2 = -rootpqarray(ma, j - mb);
const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
caconjxpy(b, u_up2, u_accum);
ulist(jju_index,iatom,jnbor) = u_accum;
// We no longer accumulate into ulisttot in this kernel.
// Instead, we have a separate kernel which avoids atomics.
// Running two separate kernels is net faster.
// back up into shared memory for next iter
if (j != twojmax) buf2[jju_shared_idx] = u_accum;
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb))
// We can avoid this if we're on the last row for an integer j
if (!(n_ma % 2 == 1 && (mb+1) == n_mb)) {
int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1);
const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1);
const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1);
if (sign_factor == 1) {
u_accum.im = -u_accum.im;
} else {
u_accum.re = -u_accum.re;
}
ulist(jjup_flip,iatom,jnbor) = u_accum;
if (j != twojmax) buf2[jju_shared_flip] = u_accum;
}
});
// In CUDA backend,
// ThreadVectorRange has a __syncwarp (appropriately masked for
// vector lengths < 32) implicit at the end
// swap double buffers
auto tmp = buf1; buf1 = buf2; buf2 = tmp;
//std::swap(buf1, buf2); // throws warnings
}
}
// CPU version
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor, void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
@ -976,152 +1073,9 @@ void SNAKokkos<DeviceType>::compute_uarray_cpu(const typename Kokkos::TeamPolicy
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
compute derivatives of Wigner U-functions for one neighbor compute derivatives of Wigner U-functions for one neighbor
see comments in compute_uarray() see comments in compute_uarray_cpu()
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_duarray(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
double x, double y, double z,
double z0, double r, double dz0dr,
double wj, double rcut)
{
// get shared memory offset
const int max_m_tile = (twojmax+1)*(twojmax+1);
const int team_rank = team.team_rank();
// double buffer for ulist
SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
// double buffer for dulist
SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0);
const double sfac = wj * compute_sfac(r, rcut);
const double dsfac = wj * compute_dsfac(r, rcut);
const double rinv = 1.0 / r;
// extract a single unit vector
const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv);
// Compute Cayley-Klein parameters for unit quaternion
const double r0inv = 1.0 / sqrt(r * r + z0 * z0);
const SNAcomplex a = { r0inv * z0, -r0inv * z };
const SNAcomplex b = { r0inv * y, -r0inv * x };
const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr);
const double dr0inv = dr0invdr * u;
const double dz0 = dz0dr * u;
const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv,
- z * dr0inv + (dir == 2 ? - r0inv : 0.) };
const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.),
-x * dr0inv + (dir==0?-r0inv:0.) };
// single has a warp barrier at the end
Kokkos::single(Kokkos::PerThread(team), [=]() {
dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here
ulist_buf1[max_m_tile*team_rank] = {1., 0.};
dulist_buf1[max_m_tile*team_rank] = {0., 0.};
});
for (int j = 1; j <= twojmax; j++) {
int jju = idxu_block[j];
int jjup = idxu_block[j-1];
// flatten the loop over ma,mb
// for (int ma = 0; ma <= j; ma++)
const int n_ma = j+1;
// for (int mb = 0; 2*mb <= j; mb++)
const int n_mb = j/2+1;
const int total_iters = n_ma * n_mb;
//for (int m = 0; m < total_iters; m++) {
Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters),
[&] (const int m) {
// ma fast, mb slow
int ma = m % n_ma;
int mb = m / n_ma;
const int jju_index = jju+mb+mb*j+ma;
// index into shared memory
const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma;
const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma;
// Need to compute and accumulate both u and du (mayhaps, we could probably
// balance some read and compute by reading u each time).
SNAcomplex u_accum = { 0., 0. };
SNAcomplex du_accum = { 0., 0. };
const double rootpq1 = rootpqarray(j - ma, j - mb);
const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.);
caconjxpy(a, u_up1, u_accum);
const double rootpq2 = -rootpqarray(ma, j - mb);
const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.);
caconjxpy(b, u_up2, u_accum);
// No need to save u_accum to global memory
if (j != twojmax) ulist_buf2[jju_shared_idx] = u_accum;
// Next, spin up du_accum
const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.);
caconjxpy(da, u_up1, du_accum);
caconjxpy(a, du_up1, du_accum);
const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.);
caconjxpy(db, u_up2, du_accum);
caconjxpy(b, du_up2, du_accum);
dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
if (j != twojmax) dulist_buf2[jju_shared_idx] = du_accum;
// copy left side to right side with inversion symmetry VMK 4.4(2)
// u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb])
int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1);
const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1);
const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1);
if (sign_factor == 1) {
//ulist_alt(iatom,jnbor,jjup_flip).re = u_accum.re;
//ulist_alt(iatom,jnbor,jjup_flip).im = -u_accum.im;
u_accum.im = -u_accum.im;
du_accum.im = -du_accum.im;
} else {
//ulist_alt(iatom,jnbor,jjup_flip).re = -u_accum.re;
//ulist_alt(iatom,jnbor,jjup_flip).im = u_accum.im;
u_accum.re = -u_accum.re;
du_accum.re = -du_accum.re;
}
dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum);
if (j != twojmax) {
ulist_buf2[jju_shared_flip] = u_accum;
dulist_buf2[jju_shared_flip] = du_accum;
}
});
// swap buffers
auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp;
tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp;
}
}
template<class DeviceType> template<class DeviceType>
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
void SNAKokkos<DeviceType>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor, void SNAKokkos<DeviceType>::compute_duarray_cpu(const typename Kokkos::TeamPolicy<DeviceType>::member_type& team, int iatom, int jnbor,
@ -1680,11 +1634,17 @@ double SNAKokkos<DeviceType>::memory_usage()
bytes += jdimpq*jdimpq * sizeof(double); // pqarray bytes += jdimpq*jdimpq * sizeof(double); // pqarray
bytes += idxcg_max * sizeof(double); // cglist bytes += idxcg_max * sizeof(double); // cglist
#ifdef KOKKOS_ENABLE_CUDA
if (!std::is_same<DeviceType,Kokkos::Cuda>::value) {
#endif
bytes += natom * idxu_max * sizeof(double) * 2; // ulist bytes += natom * idxu_max * sizeof(double) * 2; // ulist
bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist
#ifdef KOKKOS_ENABLE_CUDA
}
#endif
bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot
if (!Kokkos::Impl::is_same<typename DeviceType::array_layout,Kokkos::LayoutRight>::value) if (!Kokkos::Impl::is_same<typename DeviceType::array_layout,Kokkos::LayoutRight>::value)
bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot_lr bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot_lr
bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist
bytes += natom * idxz_max * sizeof(double) * 2; // zlist bytes += natom * idxz_max * sizeof(double) * 2; // zlist
bytes += natom * idxb_max * sizeof(double); // blist bytes += natom * idxb_max * sizeof(double); // blist

View File

@ -2920,7 +2920,7 @@ void MSM::compute_phis_and_dphis(const double &dx, const double &dy,
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
compute phi using interpolating polynomial compute phi using interpolating polynomial
see Eq 7 from Parallel Computing 35 (2009) 164<EFBFBD>177 see Eq 7 from Parallel Computing 35 (2009) 164-177
and Hardy's thesis and Hardy's thesis
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
@ -2999,7 +2999,7 @@ inline double MSM::compute_phi(const double &xi)
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
compute the derivative of phi compute the derivative of phi
phi is an interpolating polynomial phi is an interpolating polynomial
see Eq 7 from Parallel Computing 35 (2009) 164<EFBFBD>177 see Eq 7 from Parallel Computing 35 (2009) 164-177
and Hardy's thesis and Hardy's thesis
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */

View File

@ -12,7 +12,7 @@
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
Contributing author: Markus H<EFBFBD>hnerbach (RWTH) Contributing author: Markus Höhnerbach (RWTH)
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
#include <cmath> #include <cmath>

View File

@ -185,7 +185,7 @@ void VerletLRTIntel::setup(int flag)
_kspace_done = 0; _kspace_done = 0;
pthread_mutex_unlock(&_kmutex); pthread_mutex_unlock(&_kmutex);
#elif defined(_LMP_INTEL_LRT_11) #elif defined(_LMP_INTEL_LRT_11)
kspace_thread.join(); _kspace_thread.join();
#endif #endif
if (kspace_compute_flag) _intel_kspace->compute_second(eflag,vflag); if (kspace_compute_flag) _intel_kspace->compute_second(eflag,vflag);
@ -298,9 +298,9 @@ void VerletLRTIntel::run(int n)
pthread_cond_signal(&_kcond); pthread_cond_signal(&_kcond);
pthread_mutex_unlock(&_kmutex); pthread_mutex_unlock(&_kmutex);
#elif defined(_LMP_INTEL_LRT_11) #elif defined(_LMP_INTEL_LRT_11)
std::thread kspace_thread; std::thread _kspace_thread;
if (kspace_compute_flag) if (kspace_compute_flag)
kspace_thread=std::thread([=] { _kspace_thread=std::thread([=] {
_intel_kspace->compute_first(eflag, vflag); _intel_kspace->compute_first(eflag, vflag);
timer->stamp(Timer::KSPACE); timer->stamp(Timer::KSPACE);
} ); } );
@ -332,7 +332,7 @@ void VerletLRTIntel::run(int n)
pthread_mutex_unlock(&_kmutex); pthread_mutex_unlock(&_kmutex);
#elif defined(_LMP_INTEL_LRT_11) #elif defined(_LMP_INTEL_LRT_11)
if (kspace_compute_flag) if (kspace_compute_flag)
kspace_thread.join(); _kspace_thread.join();
#endif #endif
if (kspace_compute_flag) { if (kspace_compute_flag) {

View File

@ -13,7 +13,7 @@
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
The SMTBQ code has been developed with the financial support of CNRS and The SMTBQ code has been developed with the financial support of CNRS and
of the Regional Council of Burgundy (Convention n<EFBFBD> 2010-9201AAO037S03129) of the Regional Council of Burgundy (Convention n° 2010-9201AAO037S03129)
Copyright (2015) Copyright (2015)
Universite de Bourgogne : Nicolas SALLES, Olivier POLITANO Universite de Bourgogne : Nicolas SALLES, Olivier POLITANO
@ -943,7 +943,7 @@ void PairSMTBQ::compute(int eflag, int vflag)
3 -> Short int. Ox-Ox 3 -> Short int. Ox-Ox
4 -> Short int. SMTB (repulsion) 4 -> Short int. SMTB (repulsion)
5 -> Covalent energy SMTB 5 -> Covalent energy SMTB
6 -> Somme des Q(i)<EFBFBD> 6 -> Somme des Q(i)²
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
/* -------------- N-body forces Calcul --------------- */ /* -------------- N-body forces Calcul --------------- */
@ -3022,7 +3022,7 @@ void PairSMTBQ::groupQEqAllParallel_QEq()
ngp = igp = 0; nelt[ngp] = 0; ngp = igp = 0; nelt[ngp] = 0;
// On prend un oxygène // On prend un oxygène
// printf ("[me %d] On prend un oxygene\n",me); // printf ("[me %d] On prend un oxygene\n",me);
for (ii = 0; ii < inum; ii++) { for (ii = 0; ii < inum; ii++) {

View File

@ -36,6 +36,7 @@ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp)
forceclearflag = 0; forceclearflag = 0;
size_data_bonus = 0; size_data_bonus = 0;
maxexchange = 0; maxexchange = 0;
molecular = 0;
kokkosable = 0; kokkosable = 0;

View File

@ -661,6 +661,8 @@ int Variable::next(int narg, char **arg)
} else if (istyle == UNIVERSE || istyle == ULOOP) { } else if (istyle == UNIVERSE || istyle == ULOOP) {
uloop_again:
// wait until lock file can be created and owned by proc 0 of this world // wait until lock file can be created and owned by proc 0 of this world
// rename() is not atomic in practice, but no known simple fix // rename() is not atomic in practice, but no known simple fix
// means multiple procs can read/write file at the same time (bad!) // means multiple procs can read/write file at the same time (bad!)
@ -669,7 +671,7 @@ int Variable::next(int narg, char **arg)
// delay for random fraction of 1 second before subsequent tries // delay for random fraction of 1 second before subsequent tries
// when successful, read next available index and Bcast it within my world // when successful, read next available index and Bcast it within my world
int nextindex; int nextindex = -1;
if (me == 0) { if (me == 0) {
int seed = 12345 + universe->me + which[find(arg[0])]; int seed = 12345 + universe->me + which[find(arg[0])];
RanMars *random = new RanMars(lmp,seed); RanMars *random = new RanMars(lmp,seed);
@ -682,10 +684,33 @@ int Variable::next(int narg, char **arg)
} }
delete random; delete random;
FILE *fp = fopen("tmp.lammps.variable.lock","r"); // if the file cannot be found, we may have a race with some
fscanf(fp,"%d",&nextindex); // other MPI rank that has called rename at the same time
//printf("READ %d %d\n",universe->me,nextindex); // and we have to start over.
// if the read is short (we need at least one byte) we try reading again.
FILE *fp;
char buf[64];
for (int loopmax = 0; loopmax < 100; ++loopmax) {
fp = fopen("tmp.lammps.variable.lock","r");
if (fp == NULL) goto uloop_again;
buf[0] = buf[1] = '\0';
fread(buf,1,64,fp);
fclose(fp); fclose(fp);
if (strlen(buf) > 0) {
nextindex = atoi(buf);
break;
}
delay = (int) (1000000*random->uniform());
usleep(delay);
}
if (nextindex < 0)
error->one(FLERR,"Unexpected error while incrementing uloop "
"style variable. Please contact LAMMPS developers.");
//printf("READ %d %d\n",universe->me,nextindex);
fp = fopen("tmp.lammps.variable.lock","w"); fp = fopen("tmp.lammps.variable.lock","w");
fprintf(fp,"%d\n",nextindex+1); fprintf(fp,"%d\n",nextindex+1);
//printf("WRITE %d %d\n",universe->me,nextindex+1); //printf("WRITE %d %d\n",universe->me,nextindex+1);