From 7e5782a81b2e0e51a92d1adaf3683100ece6ed4a Mon Sep 17 00:00:00 2001 From: Stan Gerald Moore Date: Thu, 26 Aug 2021 08:21:16 -0600 Subject: [PATCH 01/15] Fix deallocation issue in pair_lj_charmm_coul_charmm_kokkos --- src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp b/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp index 66064d58b2..982ec9d99e 100644 --- a/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp +++ b/src/KOKKOS/pair_lj_charmm_coul_charmm_kokkos.cpp @@ -69,7 +69,7 @@ PairLJCharmmCoulCharmmKokkos::~PairLJCharmmCoulCharmmKokkos() if (allocated) { memoryKK->destroy_kokkos(k_eatom,eatom); memoryKK->destroy_kokkos(k_vatom,vatom); - k_cutsq = DAT::tdual_ffloat_2d(); + memoryKK->destroy_kokkos(k_cutsq,cutsq); } } From 0e8facdcbbc69ada65fa51c4aa3a835f1a924a2a Mon Sep 17 00:00:00 2001 From: Stan Gerald Moore Date: Thu, 26 Aug 2021 08:58:58 -0600 Subject: [PATCH 02/15] Makefile.kokkos: fix (standard_in) 1: syntax error (kokkos PR4173) --- lib/kokkos/Makefile.kokkos | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 013d2b3ede..2a984eefb6 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -406,8 +406,8 @@ KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POW KOKKOS_INTERNAL_USE_TM := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX)) # Incompatible flags? 
-KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc )) -KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc)) +KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1") | bc ) +KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1") | bc) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1) $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) ) From be98d0bbd95a67400c5900504fc585a1b6e69811 Mon Sep 17 00:00:00 2001 From: Stan Gerald Moore Date: Thu, 26 Aug 2021 09:36:16 -0600 Subject: [PATCH 03/15] Fix memory issue in fix_nvt_sllod_kokkos --- src/KOKKOS/fix_nvt_sllod_kokkos.cpp | 7 ++++--- src/KOKKOS/fix_nvt_sllod_kokkos.h | 19 +++++++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp index d0af72f17f..5ba0e6b666 100644 --- a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp +++ b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp @@ -65,8 +65,6 @@ void FixNVTSllodKokkos::init() { FixNHKokkos::init(); - vdelu = typename ArrayTypes::t_v_array("nvt/sllod/kk:vdelu", atomKK->nlocal); - if (!this->temperature->tempbias) this->error->all(FLERR,"Temperature for fix nvt/sllod does not have a bias"); @@ -100,7 +98,7 @@ void FixNVTSllodKokkos::nh_v_temp() // calculate temperature since some computes require temp // computed on current nlocal 
atoms to remove bias - if (nondeformbias){ + if (nondeformbias) { atomKK->sync(this->temperature->execution_space,this->temperature->datamask_read); this->temperature->compute_scalar(); atomKK->modified(this->temperature->execution_space,this->temperature->datamask_modify); @@ -115,6 +113,9 @@ void FixNVTSllodKokkos::nh_v_temp() d_h_two = Few(h_two); + if (vdelu.extent(0) < atomKK->nmax) + vdelu = typename AT::t_v_array(Kokkos::NoInit("nvt/sllod/kk:vdelu"), atomKK->nmax); + this->copymode = 1; Kokkos::parallel_for(Kokkos::RangePolicy(0,nlocal),*this); this->copymode = 0; diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.h b/src/KOKKOS/fix_nvt_sllod_kokkos.h index 6057ce44d0..84e57ab2c3 100644 --- a/src/KOKKOS/fix_nvt_sllod_kokkos.h +++ b/src/KOKKOS/fix_nvt_sllod_kokkos.h @@ -35,6 +35,9 @@ struct TagFixNVTSllod_temp2{}; template class FixNVTSllodKokkos : public FixNHKokkos { public: + typedef DeviceType device_type; + typedef ArrayTypes AT; + FixNVTSllodKokkos(class LAMMPS *, int, char **); ~FixNVTSllodKokkos() {} void init(); @@ -51,14 +54,14 @@ class FixNVTSllodKokkos : public FixNHKokkos { void nh_v_temp(); protected: - typename ArrayTypes::t_x_array x; - typename ArrayTypes::t_v_array v; - typename ArrayTypes::t_v_array vdelu; - typename ArrayTypes::t_f_array_const f; - typename ArrayTypes::t_float_1d rmass; - typename ArrayTypes::t_float_1d mass; - typename ArrayTypes::t_int_1d type; - typename ArrayTypes::t_int_1d mask; + typename AT::t_x_array x; + typename AT::t_v_array v; + typename AT::t_v_array vdelu; + typename AT::t_f_array_const f; + typename AT::t_float_1d rmass; + typename AT::t_float_1d mass; + typename AT::t_int_1d type; + typename AT::t_int_1d mask; Few d_h_two; From d38549e05fcb6577af834dcdb4098529f3e25e65 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 26 Aug 2021 11:03:57 -0600 Subject: [PATCH 04/15] Update ancient Kokkos Arch in Makefile.kokkos_cuda_mpi --- src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi index 3971cc6c06..c6071cf747 100644 --- a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi +++ b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi @@ -23,7 +23,7 @@ ARCHIVE = ar ARFLAGS = -rc SHLIBFLAGS = -shared KOKKOS_DEVICES = Cuda -KOKKOS_ARCH = Kepler35 +KOKKOS_ARCH = Volta70 # --------------------------------------------------------------------- # LAMMPS-specific settings, all OPTIONAL From 49b0623d6b27923e04d22552c98566cf1f527828 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 26 Aug 2021 13:26:46 -0600 Subject: [PATCH 05/15] Fix issue with Kokkos granular and pair/only on --- src/GRANULAR/fix_freeze.h | 2 +- src/KOKKOS/fix_freeze_kokkos.cpp | 49 ++----------------- src/KOKKOS/fix_freeze_kokkos.h | 6 +-- src/KOKKOS/fix_neigh_history_kokkos.cpp | 15 +++++- src/KOKKOS/fix_neigh_history_kokkos.h | 10 +++- src/KOKKOS/fix_nve_sphere_kokkos.cpp | 2 +- src/KOKKOS/fix_nve_sphere_kokkos.h | 2 + src/KOKKOS/pair_gran_hooke_history_kokkos.cpp | 12 ++++- src/KOKKOS/pair_gran_hooke_history_kokkos.h | 4 +- 9 files changed, 41 insertions(+), 61 deletions(-) diff --git a/src/GRANULAR/fix_freeze.h b/src/GRANULAR/fix_freeze.h index 2fc5fda71e..5846bfd769 100644 --- a/src/GRANULAR/fix_freeze.h +++ b/src/GRANULAR/fix_freeze.h @@ -30,7 +30,7 @@ class FixFreeze : public Fix { int setmask(); void init(); void setup(int); - void post_force(int); + virtual void post_force(int); void post_force_respa(int, int, int); double compute_vector(int); diff --git a/src/KOKKOS/fix_freeze_kokkos.cpp b/src/KOKKOS/fix_freeze_kokkos.cpp index 190a054f8b..9a486a3fcc 100644 --- a/src/KOKKOS/fix_freeze_kokkos.cpp +++ b/src/KOKKOS/fix_freeze_kokkos.cpp @@ -28,41 +28,16 @@ FixFreezeKokkos::FixFreezeKokkos(LAMMPS *lmp, int narg, char **arg) atomKK = (AtomKokkos *)atom; execution_space = ExecutionSpaceFromDevice::space; - datamask_read = F_MASK | MASK_MASK; + datamask_read = F_MASK | MASK_MASK | 
TORQUE_MASK; datamask_modify = F_MASK | TORQUE_MASK; } /* ---------------------------------------------------------------------- */ -template -int FixFreezeKokkos::setmask() -{ - return FixFreeze::setmask(); -} - -/* ---------------------------------------------------------------------- */ - -template -void FixFreezeKokkos::init() -{ - FixFreeze::init(); -} - -/* ---------------------------------------------------------------------- */ - -template -void FixFreezeKokkos::setup(int vflag) -{ - FixFreeze::setup(vflag); -} - -/* ---------------------------------------------------------------------- */ - template void FixFreezeKokkos::post_force(int /*vflag*/) { atomKK->sync(execution_space,datamask_read); - atomKK->modified(execution_space,datamask_modify); f = atomKK->k_f.view(); torque = atomKK->k_torque.view(); @@ -80,28 +55,10 @@ void FixFreezeKokkos::post_force(int /*vflag*/) foriginal[0] = original.values[0]; foriginal[1] = original.values[1]; foriginal[2] = original.values[2]; + + atomKK->modified(execution_space,datamask_modify); } -/* ---------------------------------------------------------------------- */ - -template -void FixFreezeKokkos::post_force_respa(int vflag, int /*ilevel*/, int /*iloop*/) -{ - post_force(vflag); -} - -/* ---------------------------------------------------------------------- - return components of total force on fix group before force was changed -------------------------------------------------------------------------- */ - -template -double FixFreezeKokkos::compute_vector(int n) -{ - return FixFreeze::compute_vector(n); -} - -/* ---------------------------------------------------------------------- */ - template KOKKOS_INLINE_FUNCTION void FixFreezeKokkos::operator()(const int i, OriginalForce &original) const { diff --git a/src/KOKKOS/fix_freeze_kokkos.h b/src/KOKKOS/fix_freeze_kokkos.h index dcfc14bd3d..67d4f3272c 100644 --- a/src/KOKKOS/fix_freeze_kokkos.h +++ b/src/KOKKOS/fix_freeze_kokkos.h @@ -31,6 +31,7 @@ namespace 
LAMMPS_NS { template class FixFreezeKokkos : public FixFreeze { public: + typedef DeviceType device_type; struct OriginalForce { double values[3]; @@ -58,12 +59,7 @@ class FixFreezeKokkos : public FixFreeze { }; FixFreezeKokkos(class LAMMPS *, int, char **); - int setmask(); - void init(); - void setup(int); void post_force(int); - void post_force_respa(int, int, int); - double compute_vector(int); KOKKOS_INLINE_FUNCTION void operator()(const int i, OriginalForce &original) const; diff --git a/src/KOKKOS/fix_neigh_history_kokkos.cpp b/src/KOKKOS/fix_neigh_history_kokkos.cpp index 611a8a40ef..4837148ee0 100644 --- a/src/KOKKOS/fix_neigh_history_kokkos.cpp +++ b/src/KOKKOS/fix_neigh_history_kokkos.cpp @@ -87,6 +87,9 @@ void FixNeighHistoryKokkos::pre_exchange() { copymode = 1; + k_firstflag.sync(); + k_firstvalue.sync(); + h_resize() = 1; while (h_resize() > 0) { FixNeighHistoryKokkosZeroPartnerCountFunctor zero(this); @@ -168,6 +171,9 @@ void FixNeighHistoryKokkos::post_neighbor() { tag = atomKK->k_tag.view(); + k_firstflag.sync(); + k_firstvalue.sync(); + int inum = pair->list->inum; NeighListKokkos* k_list = static_cast*>(pair->list); d_numneigh = k_list->d_numneigh; @@ -185,8 +191,10 @@ void FixNeighHistoryKokkos::post_neighbor() if (maxatom < nlocal || k_list->maxneighs > (int)d_firstflag.extent(1)) { maxatom = nall; - d_firstflag = Kokkos::View("neighbor_history:firstflag",maxatom,k_list->maxneighs); - d_firstvalue = Kokkos::View("neighbor_history:firstvalue",maxatom,k_list->maxneighs*dnum); + k_firstflag = DAT::tdual_int_2d("neighbor_history:firstflag",maxatom,k_list->maxneighs); + k_firstvalue = DAT::tdual_float_2d("neighbor_history:firstvalue",maxatom,k_list->maxneighs*dnum); + d_firstflag = k_firstflag.view(); + d_firstvalue = k_firstvalue.view(); } copymode = 1; @@ -194,6 +202,9 @@ void FixNeighHistoryKokkos::post_neighbor() FixNeighHistoryKokkosPostNeighborFunctor f(this); Kokkos::parallel_for(inum,f); + k_firstflag.modify(); + k_firstvalue.modify(); + 
copymode = 0; } diff --git a/src/KOKKOS/fix_neigh_history_kokkos.h b/src/KOKKOS/fix_neigh_history_kokkos.h index acc1e9c408..0442b46cbd 100644 --- a/src/KOKKOS/fix_neigh_history_kokkos.h +++ b/src/KOKKOS/fix_neigh_history_kokkos.h @@ -50,10 +50,13 @@ class FixNeighHistoryKokkos : public FixNeighHistory { KOKKOS_INLINE_FUNCTION void post_neighbor_item(const int &ii) const; - typename Kokkos::View d_firstflag; - typename Kokkos::View d_firstvalue; + typename DAT::tdual_int_2d k_firstflag; + typename DAT::tdual_float_2d k_firstvalue; private: + typename ArrayTypes::t_int_2d d_firstflag; + typename ArrayTypes::t_float_2d d_firstvalue; + typename ArrayTypes::tdual_int_1d k_npartner; typename ArrayTypes::tdual_tagint_2d k_partner; typename ArrayTypes::tdual_float_2d k_valuepartner; @@ -74,6 +77,7 @@ class FixNeighHistoryKokkos : public FixNeighHistory { template struct FixNeighHistoryKokkosZeroPartnerCountFunctor { + typedef DeviceType device_type; FixNeighHistoryKokkos c; FixNeighHistoryKokkosZeroPartnerCountFunctor(FixNeighHistoryKokkos *c_ptr): c(*c_ptr) {} KOKKOS_INLINE_FUNCTION @@ -84,6 +88,7 @@ struct FixNeighHistoryKokkosZeroPartnerCountFunctor { template struct FixNeighHistoryKokkosPreExchangeFunctor { + typedef DeviceType device_type; FixNeighHistoryKokkos c; FixNeighHistoryKokkosPreExchangeFunctor(FixNeighHistoryKokkos *c_ptr): c(*c_ptr) {} KOKKOS_INLINE_FUNCTION @@ -94,6 +99,7 @@ struct FixNeighHistoryKokkosPreExchangeFunctor { template struct FixNeighHistoryKokkosPostNeighborFunctor { + typedef DeviceType device_type; FixNeighHistoryKokkos c; FixNeighHistoryKokkosPostNeighborFunctor(FixNeighHistoryKokkos *c_ptr): c(*c_ptr) {} KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/fix_nve_sphere_kokkos.cpp b/src/KOKKOS/fix_nve_sphere_kokkos.cpp index 787171b6ce..1c5011c91f 100644 --- a/src/KOKKOS/fix_nve_sphere_kokkos.cpp +++ b/src/KOKKOS/fix_nve_sphere_kokkos.cpp @@ -31,7 +31,7 @@ FixNVESphereKokkos::FixNVESphereKokkos(LAMMPS *lmp, int narg, char * atomKK = 
(AtomKokkos *)atom; execution_space = ExecutionSpaceFromDevice::space; - datamask_read = F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK; + datamask_read = X_MASK | V_MASK | OMEGA_MASK| F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK; datamask_modify = X_MASK | V_MASK | OMEGA_MASK; } diff --git a/src/KOKKOS/fix_nve_sphere_kokkos.h b/src/KOKKOS/fix_nve_sphere_kokkos.h index 888a1baa0d..f3e3df13d4 100644 --- a/src/KOKKOS/fix_nve_sphere_kokkos.h +++ b/src/KOKKOS/fix_nve_sphere_kokkos.h @@ -56,6 +56,7 @@ class FixNVESphereKokkos : public FixNVESphere { template struct FixNVESphereKokkosInitialIntegrateFunctor { + typedef DeviceType device_type; FixNVESphereKokkos c; FixNVESphereKokkosInitialIntegrateFunctor(FixNVESphereKokkos *c_ptr): c(*c_ptr) { c.cleanup_copy(); } KOKKOS_INLINE_FUNCTION @@ -66,6 +67,7 @@ struct FixNVESphereKokkosInitialIntegrateFunctor { template struct FixNVESphereKokkosFinalIntegrateFunctor { + typedef DeviceType device_type; FixNVESphereKokkos c; FixNVESphereKokkosFinalIntegrateFunctor(FixNVESphereKokkos *c_ptr): c(*c_ptr) { c.cleanup_copy(); } KOKKOS_INLINE_FUNCTION diff --git a/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp b/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp index fab33d0ec7..b47b5f5a47 100644 --- a/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp +++ b/src/KOKKOS/pair_gran_hooke_history_kokkos.cpp @@ -165,8 +165,11 @@ void PairGranHookeHistoryKokkos::compute(int eflag_in, int vflag_in) d_neighbors.extent(1) != d_neighbors_touch.extent(1)) d_neighbors_touch = typename AT::t_neighbors_2d("pair:neighbors_touch",d_neighbors.extent(0),d_neighbors.extent(1)); - d_firsttouch = fix_historyKK->d_firstflag; - d_firstshear = fix_historyKK->d_firstvalue; + fix_historyKK->k_firstflag.template sync(); + fix_historyKK->k_firstvalue.template sync(); + + d_firsttouch = fix_historyKK->k_firstflag.template view(); + d_firstshear = fix_historyKK->k_firstvalue.template view(); Kokkos::parallel_for(Kokkos::RangePolicy(0,inum),*this); 
@@ -258,6 +261,11 @@ void PairGranHookeHistoryKokkos::compute(int eflag_in, int vflag_in) } } + if (eflag_atom) { + k_eatom.template modify(); + k_eatom.template sync(); + } + if (vflag_global) { virial[0] += ev.v[0]; virial[1] += ev.v[1]; diff --git a/src/KOKKOS/pair_gran_hooke_history_kokkos.h b/src/KOKKOS/pair_gran_hooke_history_kokkos.h index 6b887c0df4..37fb208a70 100644 --- a/src/KOKKOS/pair_gran_hooke_history_kokkos.h +++ b/src/KOKKOS/pair_gran_hooke_history_kokkos.h @@ -92,8 +92,8 @@ class PairGranHookeHistoryKokkos : public PairGranHookeHistory { typename AT::t_int_1d_randomread d_ilist; typename AT::t_int_1d_randomread d_numneigh; - typename Kokkos::View d_firsttouch; - typename Kokkos::View d_firstshear; + typename AT::t_int_2d d_firsttouch; + typename AT::t_float_2d d_firstshear; typename AT::t_neighbors_2d d_neighbors_touch; typename AT::t_int_1d d_numneigh_touch; From ddbb8f1aa64539c70bffb99332367623a26016f0 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 26 Aug 2021 13:36:32 -0600 Subject: [PATCH 06/15] Remove unnecessary data transfer in fix_nve_sphere_kokkos --- src/KOKKOS/fix_nve_sphere_kokkos.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/KOKKOS/fix_nve_sphere_kokkos.cpp b/src/KOKKOS/fix_nve_sphere_kokkos.cpp index 1c5011c91f..12b170acee 100644 --- a/src/KOKKOS/fix_nve_sphere_kokkos.cpp +++ b/src/KOKKOS/fix_nve_sphere_kokkos.cpp @@ -31,8 +31,8 @@ FixNVESphereKokkos::FixNVESphereKokkos(LAMMPS *lmp, int narg, char * atomKK = (AtomKokkos *)atom; execution_space = ExecutionSpaceFromDevice::space; - datamask_read = X_MASK | V_MASK | OMEGA_MASK| F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK; - datamask_modify = X_MASK | V_MASK | OMEGA_MASK; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; } /* ---------------------------------------------------------------------- */ @@ -61,8 +61,7 @@ void FixNVESphereKokkos::init() template void FixNVESphereKokkos::initial_integrate(int /*vflag*/)
{ - atomKK->sync(execution_space,datamask_read); - atomKK->modified(execution_space,datamask_modify); + atomKK->sync(execution_space, X_MASK | V_MASK | OMEGA_MASK| F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK); x = atomKK->k_x.view(); v = atomKK->k_v.view(); @@ -78,6 +77,8 @@ void FixNVESphereKokkos::initial_integrate(int /*vflag*/) FixNVESphereKokkosInitialIntegrateFunctor f(this); Kokkos::parallel_for(nlocal,f); + + atomKK->modified(execution_space, X_MASK | V_MASK | OMEGA_MASK); } /* ---------------------------------------------------------------------- */ @@ -109,8 +110,7 @@ void FixNVESphereKokkos::initial_integrate_item(const int i) const template void FixNVESphereKokkos::final_integrate() { - atomKK->sync(execution_space,datamask_read); - atomKK->modified(execution_space,datamask_modify); + atomKK->sync(execution_space, V_MASK | OMEGA_MASK| F_MASK | TORQUE_MASK | RMASS_MASK | RADIUS_MASK | MASK_MASK); v = atomKK->k_v.view(); omega = atomKK->k_omega.view(); @@ -125,6 +125,8 @@ void FixNVESphereKokkos::final_integrate() FixNVESphereKokkosFinalIntegrateFunctor f(this); Kokkos::parallel_for(nlocal,f); + + atomKK->modified(execution_space, V_MASK | OMEGA_MASK); } /* ---------------------------------------------------------------------- */ From f63d0202be6b5c3c674143e5ebf5aa2ceb3b02ba Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Thu, 26 Aug 2021 14:54:48 -0600 Subject: [PATCH 07/15] Rely on auto_sync in verlet setup --- src/KOKKOS/verlet_kokkos.cpp | 54 ++++++++---------------------------- 1 file changed, 11 insertions(+), 43 deletions(-) diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp index 909c6930cf..5e9ab757c6 100644 --- a/src/KOKKOS/verlet_kokkos.cpp +++ b/src/KOKKOS/verlet_kokkos.cpp @@ -92,40 +92,22 @@ void VerletKokkos::setup(int flag) // acquire ghosts // build neighbor lists - atomKK->sync(Host,ALL_MASK); - atomKK->modified(Host,ALL_MASK); + lmp->kokkos->auto_sync = 1; - atomKK->setup(); + atom->setup(); 
modify->setup_pre_exchange(); - // debug - atomKK->sync(Host,ALL_MASK); - atomKK->modified(Host,ALL_MASK); - if (triclinic) domain->x2lamda(atomKK->nlocal); + if (triclinic) domain->x2lamda(atom->nlocal); domain->pbc(); - - atomKK->sync(Host,ALL_MASK); - - domain->reset_box(); comm->setup(); if (neighbor->style) neighbor->setup_bins(); - comm->exchange(); - - if (atomKK->sortfreq > 0) atomKK->sort(); - + if (atom->sortfreq > 0) atom->sort(); comm->borders(); - - if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); - - atomKK->sync(Host,ALL_MASK); - + if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); domain->image_check(); domain->box_too_small_check(); modify->setup_pre_neighbor(); - - atomKK->modified(Host,ALL_MASK); - neighbor->build(1); modify->setup_post_neighbor(); neighbor->ncalls = 0; @@ -144,7 +126,7 @@ void VerletKokkos::setup(int flag) } else if (force->pair) force->pair->compute_dummy(eflag,vflag); - if (atomKK->molecular != Atom::ATOMIC) { + if (atom->molecular != Atom::ATOMIC) { if (force->bond) { atomKK->sync(force->bond->execution_space,force->bond->datamask_read); force->bond->compute(eflag,vflag); @@ -200,35 +182,21 @@ void VerletKokkos::setup_minimal(int flag) // acquire ghosts // build neighbor lists + lmp->kokkos->auto_sync = 1; + if (flag) { - atomKK->sync(Host,ALL_MASK); - atomKK->modified(Host,ALL_MASK); - modify->setup_pre_exchange(); - // debug - atomKK->sync(Host,ALL_MASK); - atomKK->modified(Host,ALL_MASK); - - if (triclinic) domain->x2lamda(atomKK->nlocal); + if (triclinic) domain->x2lamda(atom->nlocal); domain->pbc(); - - atomKK->sync(Host,ALL_MASK); - domain->reset_box(); comm->setup(); if (neighbor->style) neighbor->setup_bins(); comm->exchange(); comm->borders(); - if (triclinic) domain->lamda2x(atomKK->nlocal+atomKK->nghost); - - atomKK->sync(Host,ALL_MASK); - + if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost); domain->image_check(); domain->box_too_small_check(); modify->setup_pre_neighbor(); - - 
atomKK->modified(Host,ALL_MASK); - neighbor->build(1); modify->setup_post_neighbor(); neighbor->ncalls = 0; @@ -247,7 +215,7 @@ void VerletKokkos::setup_minimal(int flag) } else if (force->pair) force->pair->compute_dummy(eflag,vflag); - if (atomKK->molecular != Atom::ATOMIC) { + if (atom->molecular != Atom::ATOMIC) { if (force->bond) { atomKK->sync(force->bond->execution_space,force->bond->datamask_read); force->bond->compute(eflag,vflag); From 90f82a8ef191b0fef12ec39397577733d7d4604a Mon Sep 17 00:00:00 2001 From: Jacob Gissinger Date: Fri, 27 Aug 2021 17:03:11 -0400 Subject: [PATCH 08/15] memory leak --- src/REACTION/fix_bond_react.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/REACTION/fix_bond_react.cpp b/src/REACTION/fix_bond_react.cpp index 192331889d..7b891d42fe 100644 --- a/src/REACTION/fix_bond_react.cpp +++ b/src/REACTION/fix_bond_react.cpp @@ -1990,7 +1990,10 @@ int FixBondReact::check_constraints() *ptr = satisfied[i] ? '1' : '0'; } double verdict = input->variable->evaluate_boolean(evalstr); - if (verdict == 0.0) return 0; + if (verdict == 0.0) { + memory->destroy(satisfied); + return 0; + } } // let's also check chirality within 'check_constraint' @@ -2012,7 +2015,10 @@ int FixBondReact::check_constraints() } } } - if (get_chirality(my4coords) != chiral_atoms[i][1][rxnID]) return 0; + if (get_chirality(my4coords) != chiral_atoms[i][1][rxnID]) { + memory->destroy(satisfied); + return 0; + } } } From c779798f3f3d0729cd1a44406f1386d3e5916c71 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Fri, 27 Aug 2021 19:38:48 -0400 Subject: [PATCH 09/15] properly disable clang-format processing --- src/KOKKOS/compute_temp_deform_kokkos.h | 2 +- src/KOKKOS/pppm_kokkos.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/KOKKOS/compute_temp_deform_kokkos.h b/src/KOKKOS/compute_temp_deform_kokkos.h index 8b53c1f633..0292c6776d 100644 --- a/src/KOKKOS/compute_temp_deform_kokkos.h +++ 
b/src/KOKKOS/compute_temp_deform_kokkos.h @@ -1,4 +1,3 @@ -// clang-format off /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories @@ -20,6 +19,7 @@ ComputeStyle(temp/deform/kk/host,ComputeTempDeformKokkos); // clang-format on #else +// clang-format off #ifndef LMP_COMPUTE_TEMP_DEFORM_KOKKOS_H #define LMP_COMPUTE_TEMP_DEFORM_KOKKOS_H diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp index a7f58f2525..d71d7d1bad 100644 --- a/src/KOKKOS/pppm_kokkos.cpp +++ b/src/KOKKOS/pppm_kokkos.cpp @@ -1,3 +1,4 @@ +// clang-format off /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories From 89556f0bcb3827f3c4560c2607fc4e1a8d5a84a7 Mon Sep 17 00:00:00 2001 From: Mike Brown Date: Sat, 28 Aug 2021 17:01:58 -0700 Subject: [PATCH 10/15] Override any OpenCL fast math JIT settings for born/coul/wolf{/cs}/gpu to resolve numerical deviations seen with some OpenCL implementations. 
--- lib/gpu/lal_base_charge.cpp | 16 +++++++++++----- lib/gpu/lal_base_charge.h | 7 +++++-- lib/gpu/lal_born_coul_wolf.cpp | 2 +- lib/gpu/lal_born_coul_wolf_cs.cpp | 2 +- lib/gpu/lal_device.cpp | 10 ++++++++++ lib/gpu/lal_device.h | 1 + 6 files changed, 29 insertions(+), 9 deletions(-) diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index 9045420425..84fbddd4e9 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -56,7 +56,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name) { + const char *k_name, const int disable_fast_math) { screen=_screen; int gpu_nbor=0; @@ -83,7 +83,7 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _block_size=device->pair_block_size(); _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name,disable_fast_math); if (_threads_per_atom>1 && gpu_nbor==0) { nbor->packing(true); @@ -321,14 +321,20 @@ double BaseChargeT::host_memory_usage_atomic() const { template void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { + const char *kname, + const int disable_fast_math) { if (_compiled) return; std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + std::string device_compile_string; + if (disable_fast_math) + device_compile_string = device->compile_string_nofast(); + else + device_compile_string = device->compile_string(); + std::string oclstring = device_compile_string+" -DEVFLAG=1"; pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); @@ -336,7 
+342,7 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, q_tex.get_texture(*pair_program,"q_tex"); #if defined(LAL_OCL_EV_JIT) - oclstring = device->compile_string()+" -DEVFLAG=0"; + oclstring = device_compile_string+" -DEVFLAG=0"; if (pair_program_noev) delete pair_program_noev; pair_program_noev=new UCL_Program(dev); pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index 6b8761092a..307c5c079f 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -44,6 +44,7 @@ class BaseCharge { * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device * \param k_name name for the kernel for force calculation + * \param disable_fast_math override any fast math opts for kernel JIT * * Returns: * - 0 if successful @@ -54,7 +55,8 @@ class BaseCharge { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const void *pair_program, const char *k_name, + const int disable_fast_math = 0); /// Estimate the overhead for GPU context changes and CPU driver void estimate_gpu_overhead(const int add_kernels=0); @@ -198,7 +200,8 @@ class BaseCharge { double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *k, const int disable_fast_math); virtual int loop(const int eflag, const int vflag) = 0; }; diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp index e6caebbab8..9aac866353 100644 --- a/lib/gpu/lal_born_coul_wolf.cpp +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -57,7 +57,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho const double alf, const double 
e_shift, const double f_shift) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,born_coul_wolf,"k_born_coul_wolf"); + _screen,born_coul_wolf,"k_born_coul_wolf",1); if (success!=0) return success; diff --git a/lib/gpu/lal_born_coul_wolf_cs.cpp b/lib/gpu/lal_born_coul_wolf_cs.cpp index 8deceeb1f4..abd4da439a 100644 --- a/lib/gpu/lal_born_coul_wolf_cs.cpp +++ b/lib/gpu/lal_born_coul_wolf_cs.cpp @@ -42,7 +42,7 @@ int BornCoulWolfCST::init(const int ntypes, double **host_cutsq, double **host_r const double alf, const double e_shift, const double f_shift) { int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,born_coul_wolf_cs,"k_born_coul_wolf_cs"); + _screen,born_coul_wolf_cs,"k_born_coul_wolf_cs",1); if (success!=0) return success; diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index e2b5b9cdb5..0ff7125089 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -420,6 +420,16 @@ int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) { return 0; } +template +std::string DeviceT::compile_string_nofast() { + std::string no_fast = _ocl_compile_string; + size_t p = no_fast.find("-cl-fast-relaxed-math "); + if (p != std::string::npos) no_fast.erase(p,22); + p = no_fast.find("-DFAST_MATH="); + if (p != std::string::npos) no_fast[p + 12]='0'; + return no_fast; +} + template int DeviceT::init(Answer &ans, const bool charge, const bool rot, const int nlocal, diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h index 1db6ae3127..933a3508b5 100644 --- a/lib/gpu/lal_device.h +++ b/lib/gpu/lal_device.h @@ -312,6 +312,7 @@ class Device { } inline std::string compile_string() { return _ocl_compile_string; } + std::string compile_string_nofast(); inline std::string ocl_config_name() { return _ocl_config_name; } template From fb72e00081bef2f61c7cb4689829c9c12e57939f Mon Sep 17 00:00:00 2001 From: Mike Brown Date: Sat, 28 Aug 2021 
17:18:05 -0700 Subject: [PATCH 11/15] Fix (the fix) for _MM_SCALE preprocessor defines for future Intel compilers. --- src/INTEL/intel_preprocess.h | 4 ---- src/INTEL/intel_simd.h | 7 +++++++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/INTEL/intel_preprocess.h b/src/INTEL/intel_preprocess.h index 27daa5f3d2..c7bd60b00b 100644 --- a/src/INTEL/intel_preprocess.h +++ b/src/INTEL/intel_preprocess.h @@ -20,10 +20,6 @@ #define USE_OMP_SIMD #define __INTEL_COMPILER __INTEL_LLVM_COMPILER #define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER -#define _MM_SCALE_1 1 -#define _MM_SCALE_2 2 -#define _MM_SCALE_4 4 -#define _MM_SCALE_8 8 #endif #ifdef __INTEL_COMPILER diff --git a/src/INTEL/intel_simd.h b/src/INTEL/intel_simd.h index 2affa6a394..d75b2b9175 100644 --- a/src/INTEL/intel_simd.h +++ b/src/INTEL/intel_simd.h @@ -35,6 +35,13 @@ authors for more details. #ifdef __AVX512F__ +#ifndef _MM_SCALE_1 +#define _MM_SCALE_1 1 +#define _MM_SCALE_2 2 +#define _MM_SCALE_4 4 +#define _MM_SCALE_8 8 +#endif + namespace ip_simd { typedef __mmask16 SIMD_mask; From 39d8b239ff1c58394dc49d31e8f3c0c43a1baf80 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 29 Aug 2021 17:56:47 -0400 Subject: [PATCH 12/15] don't report bogus timings --- lib/gpu/lal_base_ellipsoid.cpp | 11 +++++++---- lib/gpu/lal_device.cpp | 12 +++++++----- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 2e22b2f602..fa060bea5a 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -224,7 +224,9 @@ void BaseEllipsoidT::output_times() { #ifdef USE_OPENCL // Workaround for timing issue on Intel OpenCL + if (times[0] > 80e6) times[0]=0.0; if (times[3] > 80e6) times[3]=0.0; + if (times[6] > 80e6) times[6]=0.0; #endif if (device->replica_me()==0) @@ -237,17 +239,18 @@ void BaseEllipsoidT::output_times() { fprintf(screen,"\n-------------------------------------"); 
fprintf(screen,"--------------------------------\n"); - if (device->procs_per_gpu()==1 && times[3]>0) { - fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); + if (device->procs_per_gpu()==1 && (times[3] > 0.0)) { + if (times[0] > 0.0) + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); - if (nbor->gpu_nbor()>0) + if (nbor->gpu_nbor() > 0.0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); else fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } - if (times[6]>0) + if (times[6] > 0.0) fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Lanes / atom: %d.\n",_threads_per_atom); diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index e2b5b9cdb5..50046d8bdd 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -777,28 +777,30 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, #ifdef USE_OPENCL // Workaround for timing issue on Intel OpenCL + if (times[0] > 80e6) times[0]=0.0; if (times[3] > 80e6) times[3]=0.0; if (times[5] > 80e6) times[5]=0.0; #endif if (replica_me()==0) - if (screen && times[6]>0.0) { + if (screen && (times[6] > 0.0)) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device() && times[3]>0) { - fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); + if (time_device() && (times[3] > 0.0)) { + if (times[0] > 0.0) + fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Neighbor copy: %.4f 
s.\n",times[1]/_replica_size); - if (nbor.gpu_nbor()>0) + if (nbor.gpu_nbor() > 0.0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size); else fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } - if (times[5]>0) + if (times[5] > 0.0) fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom); From 664a07a3fe16e5bd2455bda218172f2c810bf143 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 29 Aug 2021 17:57:30 -0400 Subject: [PATCH 13/15] disallow GPU neighbor list with hybrid pair styles (which has still problems) --- src/GPU/fix_gpu.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 71ab3f4cb4..843bff2a35 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -13,25 +13,26 @@ ------------------------------------------------------------------------- */ #include "fix_gpu.h" -#include #include "atom.h" +#include "citeme.h" #include "comm.h" +#include "domain.h" +#include "error.h" #include "force.h" +#include "gpu_extra.h" +#include "input.h" +#include "modify.h" +#include "neighbor.h" #include "pair.h" #include "pair_hybrid.h" #include "pair_hybrid_overlay.h" #include "respa.h" -#include "input.h" #include "timer.h" -#include "modify.h" -#include "update.h" -#include "domain.h" #include "universe.h" -#include "gpu_extra.h" -#include "neighbor.h" -#include "citeme.h" -#include "error.h" +#include "update.h" + +#include #if (LAL_USE_OMP == 1) #include @@ -275,12 +276,15 @@ void FixGPU::init() error->warning(FLERR,"Using package gpu without any pair style defined"); // make sure fdotr virial is not accumulated multiple times + // also disallow GPU neighbor lists for hybrid styles if (force->pair_match("^hybrid",0) != nullptr) { PairHybrid *hybrid = 
(PairHybrid *) force->pair; for (int i = 0; i < hybrid->nstyles; i++) if (!utils::strmatch(hybrid->keywords[i],"/gpu$")) force->pair->no_virial_fdotr_compute = 1; + if (_gpu_mode != GPU_FORCE) + error->all(FLERR, "Must not use GPU neighbor lists with hybrid pair style"); } // rRESPA support @@ -295,8 +299,7 @@ void FixGPU::setup(int vflag) { if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) if (neighbor->exclude_setting() != 0) - error->all(FLERR, - "Cannot use neigh_modify exclude with GPU neighbor builds"); + error->all(FLERR, "Cannot use neigh_modify exclude with GPU neighbor builds"); if (utils::strmatch(update->integrate_style,"^verlet")) post_force(vflag); else { From 284ed98fb8c3979b625261ceb4443bec7ddfdd2c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 29 Aug 2021 22:08:49 -0400 Subject: [PATCH 14/15] fix spelling error and reformat paragraph --- doc/src/pair_snap.rst | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/doc/src/pair_snap.rst b/doc/src/pair_snap.rst index e00169b306..1bc17fa8c8 100644 --- a/doc/src/pair_snap.rst +++ b/doc/src/pair_snap.rst @@ -191,22 +191,19 @@ pair_coeff command, to avoid ambiguity in the number of coefficients. The keywords *chunksize* and *parallelthresh* are only applicable when using the pair style *snap* with the KOKKOS package on GPUs and are -ignored otherwise. -The *chunksize* keyword controls -the number of atoms in each pass used to compute the bispectrum -components and is used to avoid running out of memory. For example -if there are 8192 atoms in the simulation and the *chunksize* -is set to 4096, the bispectrum calculation will be broken up -into two passes (running on a single GPU). -The *parallelthresh* keyword controls -a crossover threshold for performing extra parallelism. For -small systems, exposing additional parallism can be beneficial when -there is not enough work to fully saturate the GPU threads otherwise. 
-However, the extra parallelism also leads to more divergence -and can hurt performance when the system is already large enough to -saturate the GPU threads. Extra parallelism will be performed if the -*chunksize* (or total number of atoms per GPU) is smaller than -*parallelthresh*. +ignored otherwise. The *chunksize* keyword controls the number of atoms +in each pass used to compute the bispectrum components and is used to +avoid running out of memory. For example if there are 8192 atoms in the +simulation and the *chunksize* is set to 4096, the bispectrum +calculation will be broken up into two passes (running on a single GPU). +The *parallelthresh* keyword controls a crossover threshold for +performing extra parallelism. For small systems, exposing additional +parallelism can be beneficial when there is not enough work to fully +saturate the GPU threads otherwise. However, the extra parallelism also +leads to more divergence and can hurt performance when the system is +already large enough to saturate the GPU threads. Extra parallelism +will be performed if the *chunksize* (or total number of atoms per GPU) +is smaller than *parallelthresh*. Detailed definitions for all the other keywords are given on the :doc:`compute sna/atom ` doc page. 
From 00c3c5cf06b013e4cad7eb052ab5634aa3a16e15 Mon Sep 17 00:00:00 2001 From: Stan Gerald Moore Date: Mon, 30 Aug 2021 12:43:07 -0600 Subject: [PATCH 15/15] Port changes from #2903 to Kokkos --- src/KOKKOS/fix_property_atom_kokkos.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/KOKKOS/fix_property_atom_kokkos.cpp b/src/KOKKOS/fix_property_atom_kokkos.cpp index 841b791c5c..97931f5d67 100644 --- a/src/KOKKOS/fix_property_atom_kokkos.cpp +++ b/src/KOKKOS/fix_property_atom_kokkos.cpp @@ -45,23 +45,23 @@ FixPropertyAtomKokkos::FixPropertyAtomKokkos(LAMMPS *lmp, int narg, char **arg) void FixPropertyAtomKokkos::grow_arrays(int nmax) { for (int m = 0; m < nvalue; m++) { - if (style[m] == MOLECULE) { + if (styles[m] == MOLECULE) { memory->grow(atom->molecule,nmax,"atom:molecule"); size_t nbytes = (nmax-nmax_old) * sizeof(tagint); memset(&atom->molecule[nmax_old],0,nbytes); - } else if (style[m] == CHARGE) { + } else if (styles[m] == CHARGE) { memory->grow(atom->q,nmax,"atom:q"); size_t nbytes = (nmax-nmax_old) * sizeof(double); memset(&atom->q[nmax_old],0,nbytes); - } else if (style[m] == RMASS) { + } else if (styles[m] == RMASS) { memory->grow(atom->rmass,nmax,"atom:rmass"); size_t nbytes = (nmax-nmax_old) * sizeof(double); memset(&atom->rmass[nmax_old],0,nbytes); - } else if (style[m] == INTEGER) { + } else if (styles[m] == INTEGER) { memory->grow(atom->ivector[index[m]],nmax,"atom:ivector"); size_t nbytes = (nmax-nmax_old) * sizeof(int); memset(&atom->ivector[index[m]][nmax_old],0,nbytes); - } else if (style[m] == DOUBLE) { + } else if (styles[m] == DOUBLE) { atomKK->sync(Device,DVECTOR_MASK); memoryKK->grow_kokkos(atomKK->k_dvector,atomKK->dvector,atomKK->k_dvector.extent(0),nmax, "atom:dvector");