From f560cd6dd596857d14d3663e3c225fbaec2096d0 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sat, 21 Mar 2020 01:08:09 -0400 Subject: [PATCH 01/14] make certain, the molecular flag is always initialized --- src/atom_vec.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/atom_vec.cpp b/src/atom_vec.cpp index 7b89c2fd79..c4dd53ad18 100644 --- a/src/atom_vec.cpp +++ b/src/atom_vec.cpp @@ -36,6 +36,7 @@ AtomVec::AtomVec(LAMMPS *lmp) : Pointers(lmp) forceclearflag = 0; size_data_bonus = 0; maxexchange = 0; + molecular = 0; kokkosable = 0; From 79b84c0847d0f9504527ce0b5dcb99688af2943c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 22 Mar 2020 15:44:28 -0400 Subject: [PATCH 02/14] more thorough checking if BUILD_OMP may be enabled by default. we need the OpenMP runtime, too. --- cmake/CMakeLists.txt | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 4dd079eaae..36bed2d649 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -227,7 +227,15 @@ pkg_depends(USER-LB MPI) pkg_depends(USER-PHONON KSPACE) pkg_depends(USER-SCAFACOS MPI) +# detect if we may enable OpenMP support by default +set(BUILD_OMP_DEFAULT OFF) find_package(OpenMP QUIET) +if(OpenMP_FOUND) + check_include_file_cxx(omp.h HAVE_OMP_H_INCLUDE) + if(HAVE_OMP_H_INCLUDE) + set(BUILD_OMP_DEFAULT ON) + endif() +endif() # TODO: this is a temporary workaround until a better solution is found. AK 2019-05-30 # GNU GCC 9.x uses settings incompatible with our use of 'default(none)' in OpenMP pragmas @@ -237,14 +245,14 @@ find_package(OpenMP QUIET) if ((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.99.9)) option(BUILD_OMP "Build with OpenMP support" OFF) else() - option(BUILD_OMP "Build with OpenMP support" ${OpenMP_FOUND}) + option(BUILD_OMP "Build with OpenMP support" ${BUILD_OMP_DEFAULT}) endif() if(BUILD_OMP) find_package(OpenMP REQUIRED) check_include_file_cxx(omp.h HAVE_OMP_H_INCLUDE) if(NOT HAVE_OMP_H_INCLUDE) - message(FATAL_ERROR "Cannot find required 'omp.h' header file") + message(FATAL_ERROR "Cannot find the 'omp.h' header file required for full OpenMP support") endif() set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") From e1f01d3e6514eecb83584c1e2b14c88569db70bb Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 22 Mar 2020 21:14:26 -0400 Subject: [PATCH 03/14] use consistent naming for c++11 style kspace thread --- src/USER-INTEL/verlet_lrt_intel.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp index a3c05c46bc..bd143c4c94 100644 --- a/src/USER-INTEL/verlet_lrt_intel.cpp +++ b/src/USER-INTEL/verlet_lrt_intel.cpp @@ -185,7 +185,7 @@ void VerletLRTIntel::setup(int flag) _kspace_done = 0; pthread_mutex_unlock(&_kmutex); #elif defined(_LMP_INTEL_LRT_11) - kspace_thread.join(); + _kspace_thread.join(); #endif if (kspace_compute_flag) _intel_kspace->compute_second(eflag,vflag); @@ -298,9 +298,9 @@ void VerletLRTIntel::run(int n) pthread_cond_signal(&_kcond); pthread_mutex_unlock(&_kmutex); #elif defined(_LMP_INTEL_LRT_11) - std::thread kspace_thread; + std::thread _kspace_thread; if (kspace_compute_flag) - kspace_thread=std::thread([=] { + _kspace_thread=std::thread([=] { _intel_kspace->compute_first(eflag, vflag); timer->stamp(Timer::KSPACE); } ); @@ -332,7 +332,7 @@ void VerletLRTIntel::run(int n) pthread_mutex_unlock(&_kmutex); #elif defined(_LMP_INTEL_LRT_11) if (kspace_compute_flag) - kspace_thread.join(); + _kspace_thread.join(); #endif if (kspace_compute_flag) { From 98bfbbd57630fb21421d73c2478571068e343a5c Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Sun, 22 Mar 2020 22:21:12 -0400 Subject: [PATCH 04/14] fix typo in CMake module --- cmake/Modules/Packages/USER-INTEL.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/Modules/Packages/USER-INTEL.cmake b/cmake/Modules/Packages/USER-INTEL.cmake index d0941a0a12..9ae4333ee2 100644 --- a/cmake/Modules/Packages/USER-INTEL.cmake +++ b/cmake/Modules/Packages/USER-INTEL.cmake @@ -31,7 +31,7 @@ if(PKG_USER-INTEL) endif() endif() if(INTEL_LRT_MODE STREQUAL "C++11") - add_definitions(-DLMP_INTEL_USERLRT -DLMP_INTEL_LRT11) + add_definitions(-DLMP_INTEL_USELRT -DLMP_INTEL_LRT11) endif() if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") From 6e1f18961c87595058f2f8c83414ba0717eb1e13 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Mon, 23 Mar 2020 09:42:46 -0400 Subject: [PATCH 05/14] Convert characters to UTF-8 --- src/KSPACE/msm.cpp | 4 ++-- src/USER-INTEL/pair_tersoff_intel.cpp | 2 +- src/USER-SMTBQ/pair_smtbq.cpp | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp index 126236a328..1520c2c607 100644 --- a/src/KSPACE/msm.cpp +++ b/src/KSPACE/msm.cpp @@ -2920,7 +2920,7 @@ void MSM::compute_phis_and_dphis(const double &dx, const double &dy, /* ---------------------------------------------------------------------- compute phi using interpolating polynomial - see Eq 7 from Parallel Computing 35 (2009) 164–177 + see Eq 7 from Parallel Computing 35 (2009) 164-177 and Hardy's thesis ------------------------------------------------------------------------- */ @@ -2999,7 +2999,7 @@ inline double MSM::compute_phi(const double &xi) /* ---------------------------------------------------------------------- compute the derivative of phi phi is an interpolating polynomial - see Eq 7 from Parallel Computing 35 (2009) 164–177 + see Eq 7 from Parallel Computing 35 (2009) 164-177 and Hardy's thesis ------------------------------------------------------------------------- */ diff --git a/src/USER-INTEL/pair_tersoff_intel.cpp b/src/USER-INTEL/pair_tersoff_intel.cpp index 76d06b02dd..8784029320 100644 --- a/src/USER-INTEL/pair_tersoff_intel.cpp +++ b/src/USER-INTEL/pair_tersoff_intel.cpp @@ -12,7 +12,7 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Markus Höhnerbach (RWTH) + Contributing author: Markus Höhnerbach (RWTH) ------------------------------------------------------------------------- */ #include diff --git a/src/USER-SMTBQ/pair_smtbq.cpp b/src/USER-SMTBQ/pair_smtbq.cpp index f61fc1a72e..fcf6d141f2 100644 --- a/src/USER-SMTBQ/pair_smtbq.cpp +++ b/src/USER-SMTBQ/pair_smtbq.cpp @@ -13,7 +13,7 @@ /* ---------------------------------------------------------------------- The SMTBQ code has been developed with the financial support of CNRS and - of the Regional Council of Burgundy (Convention nˇ 2010-9201AAO037S03129) + of the Regional Council of Burgundy (Convention n¡ 2010-9201AAO037S03129) Copyright (2015) Universite de Bourgogne : Nicolas SALLES, Olivier POLITANO @@ -943,7 +943,7 @@ void PairSMTBQ::compute(int eflag, int vflag) 3 -> Short int. Ox-Ox 4 -> Short int. SMTB (repulsion) 5 -> Covalent energy SMTB - 6 -> Somme des Q(i)˛ + 6 -> Somme des Q(i)² ------------------------------------------------------------------------- */ /* -------------- N-body forces Calcul --------------- */ @@ -3022,7 +3022,7 @@ void PairSMTBQ::groupQEqAllParallel_QEq() ngp = igp = 0; nelt[ngp] = 0; - // On prend un oxygčne + // On prend un oxygène // printf ("[me %d] On prend un oxygene\n",me); for (ii = 0; ii < inum; ii++) { From bcfc606efb2622e9afdfae6984f6b01a187f0997 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Mon, 23 Mar 2020 13:20:56 -0700 Subject: [PATCH 06/14] SNAP optimizations, kernel fusion, large reduction of memory usage on the GPU, misc. performance optimizations. --- src/KOKKOS/pair_snap_kokkos.h | 12 +- src/KOKKOS/pair_snap_kokkos_impl.h | 111 ++--- src/KOKKOS/sna_kokkos.h | 20 +- src/KOKKOS/sna_kokkos_impl.h | 676 ++++++++++++++--------------- 4 files changed, 349 insertions(+), 470 deletions(-) diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index e53dec4d86..b57ef2d9e5 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -37,15 +37,13 @@ struct TagPairSNAPBeta{}; struct TagPairSNAPComputeNeigh{}; struct TagPairSNAPPreUi{}; struct TagPairSNAPComputeUi{}; -struct TagPairSNAPComputeUiTot{}; // accumulate ulist into ulisttot separately struct TagPairSNAPComputeUiCPU{}; struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeBi{}; struct TagPairSNAPZeroYi{}; struct TagPairSNAPComputeYi{}; -struct TagPairSNAPComputeDuidrj{}; +struct TagPairSNAPComputeFusedDeidrj{}; struct TagPairSNAPComputeDuidrjCPU{}; -struct TagPairSNAPComputeDeidrj{}; struct TagPairSNAPComputeDeidrjCPU{}; template @@ -83,9 +81,6 @@ public: KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -102,14 +97,11 @@ public: void operator() (TagPairSNAPComputeYi,const int& ii) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 1156d11c31..d807f149a9 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -30,7 +30,6 @@ #include "kokkos.h" #include "sna.h" - #define MAXLINE 1024 #define MAXWORD 3 @@ -255,26 +254,19 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values // 2 is for double buffer - typename Kokkos::TeamPolicy policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + const int tile_size = (twojmax+1)*(twojmax+1); typedef Kokkos::View< SNAcomplex*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits > ScratchViewType; - int scratch_size = ScratchViewType::shmem_size( 2 * team_size * (twojmax+1)*(twojmax+1)); + int scratch_size = ScratchViewType::shmem_size( 2 * team_size * tile_size ); + + typename Kokkos::TeamPolicy policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size )); Kokkos::parallel_for("ComputeUi",policy_ui,*this); - // ComputeUitot - vector_length = 1; - team_size = 128; - team_size_max = Kokkos::TeamPolicy::team_size_max(*this); - if (team_size*vector_length > team_size_max) - team_size = team_size_max/vector_length; - - typename Kokkos::TeamPolicy policy_ui_tot(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length); - Kokkos::parallel_for("ComputeUiTot",policy_ui_tot,*this); } @@ -316,7 +308,7 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) typename Kokkos::RangePolicy policy_yi(0,chunk_size*idxz_max); Kokkos::parallel_for("ComputeYi",policy_yi,*this); - //ComputeDuidrj + //ComputeDuidrj and Deidrj if (lmp->kokkos->ngpus == 0) { // CPU int vector_length = 1; int team_size = 1; @@ -324,53 +316,37 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); snaKK.set_dir(-1); // technically doesn't do anything Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); - } else { // GPU, utilize scratch memory and splitting over dimensions - int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); + typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + + Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); + } else { // GPU, utilize scratch memory and splitting over dimensions, fused dui and dei + + int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); int vector_length = 32; int team_size = 2; // need to cap b/c of shared memory reqs if (team_size*vector_length > team_size_max) team_size = team_size_max/vector_length; - // scratch size: 2 * 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values + // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry // 2 is for double buffer - typedef Kokkos::View< SNAcomplex*, - Kokkos::DefaultExecutionSpace::scratch_memory_space, - Kokkos::MemoryTraits > - ScratchViewType; + const int tile_size = (twojmax+1)*(twojmax/2+1); + + typedef Kokkos::View< SNAcomplex*, + Kokkos::DefaultExecutionSpace::scratch_memory_space, + Kokkos::MemoryTraits > + ScratchViewType; + int scratch_size = ScratchViewType::shmem_size( 4 * team_size * tile_size); + + typename Kokkos::TeamPolicy policy_fused_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + policy_fused_deidrj = policy_fused_deidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size )); - int scratch_size = ScratchViewType::shmem_size( 4 * team_size * (twojmax+1)*(twojmax+1)); - typename Kokkos::TeamPolicy policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - policy_duidrj = policy_duidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size )); - // Need to call three times, once for each direction for (int k = 0; k < 3; k++) { snaKK.set_dir(k); - Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this); + Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this); } } - //ComputeDeidrj - if (lmp->kokkos->ngpus == 0) { // CPU - int vector_length = 1; - int team_size = 1; - - typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - - Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); - - } else { // GPU, different loop strategy internally - - int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); - int vector_length = 32; // coalescing disaster right now, will fix later - int team_size = 8; - if (team_size*vector_length > team_size_max) - team_size = team_size_max/vector_length; - - typename Kokkos::TeamPolicy policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - - Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this); - } - //ComputeForce if (eflag) { if (neighflag == HALF) { @@ -642,25 +618,6 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeUi,const typename my_sna.compute_ui(team,ii,jj); } -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // Extract the quantum number - const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size())); - if (idx >= my_sna.idxu_max) return; - - // Extract the atomic index - const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size()); - if (ii >= chunk_size) return; - - // Extract the number of neighbors neighbor number - const int ninside = d_ninside(ii); - - my_sna.compute_uitot(team,idx,ii,ninside); -} - template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { @@ -718,7 +675,7 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const typename template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const { SNAKokkos my_sna = snaKK; // Extract the atom number @@ -730,7 +687,7 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrj,const type const int ninside = d_ninside(ii); if (jj >= ninside) return; - my_sna.compute_duidrj(team,ii,jj); + my_sna.compute_fused_deidrj(team,ii,jj); } template @@ -750,24 +707,6 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const t my_sna.compute_duidrj_cpu(team,ii,jj); } - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; - - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - my_sna.compute_deidrj(team,ii,jj); -} - template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 48d9114fbf..a6d9db3218 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -135,14 +135,10 @@ inline KOKKOS_INLINE_FUNCTION void pre_ui(const typename Kokkos::TeamPolicy::member_type& team,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_ui(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP + void compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_ui_orig(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_uitot(const typename Kokkos::TeamPolicy::member_type& team, int, int, int); // ForceSNAP - KOKKOS_INLINE_FUNCTION void compute_zi(const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void zero_yi(const int&,const int&); // ForceSNAP @@ -155,12 +151,10 @@ inline // functions for derivatives KOKKOS_INLINE_FUNCTION - void compute_duidrj(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP + void compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); //ForceSNAP KOKKOS_INLINE_FUNCTION void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_deidrj(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP - KOKKOS_INLINE_FUNCTION void compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION double compute_sfac(double, double); // add_uarraytot, compute_duarray @@ -251,10 +245,6 @@ inline KOKKOS_INLINE_FUNCTION void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, double, double, double); // compute_ui - KOKKOS_INLINE_FUNCTION - void compute_uarray(const typename Kokkos::TeamPolicy::member_type& team, int, int, - double, double, double, - double, double); // compute_ui KOKKOS_INLINE_FUNCTION void compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, double, double, double, @@ -267,12 +257,8 @@ inline inline int compute_ncoeff(); // SNAKokkos() KOKKOS_INLINE_FUNCTION - void compute_duarray(const typename Kokkos::TeamPolicy::member_type& team, int, int, - double, double, double, // compute_duidrj - double, double, double, double, double); - KOKKOS_INLINE_FUNCTION void compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, - double, double, double, // compute_duidrj + double, double, double, // compute_duidrj_cpu double, double, double, double, double); // Sets the style for the switching function diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index ef3312bd16..1daf8fd05c 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -19,6 +19,7 @@ #include #include #include +#include namespace LAMMPS_NS { @@ -231,11 +232,22 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) zlist = t_sna_2c_ll("sna:zlist",idxz_max,natom); //ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); - ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax); +#ifdef KOKKOS_ENABLE_CUDA + if (std::is_same::value) { + // dummy allocation + ulist = t_sna_3c_ll("sna:ulist",1,1,1); + dulist = t_sna_4c_ll("sna:dulist",1,1,1); + } else { +#endif + ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax); + dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax); +#ifdef KOKKOS_ENABLE_CUDA + } +#endif + //ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max); ylist = t_sna_2c_ll("sna:ylist",idxu_max,natom); - //dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max); dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax); } @@ -269,14 +281,14 @@ void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy } /* ---------------------------------------------------------------------- - compute Ui by summing over bispectrum components + compute Ui by computing Wigner U-functions for one neighbor and + accumulating to the total. GPU only. ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int iatom, const int jnbor) { - double rsq, r, x, y, z, z0, theta0; // utot(j,ma,mb) = 0 for all j,ma,ma // utot(j,ma,ma) = 1 for all j,ma @@ -284,22 +296,143 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); + //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.); + caconjxpy(b, u_up2, u_accum); + + // VMK recursion relation: grab contribution which is multiplied by a* + const double rootpq1 = rootpqarray(j - ma, j - mb); + const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.); + //const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist(jjup_index,iatom,jnbor):SNAcomplex(0.,0.); + caconjxpy(a, u_up1, u_accum); + + //ulist(jju_index,iatom,jnbor) = u_accum; + // back up into shared memory for next iter + buf2[jju_shared_idx] = u_accum; + + Kokkos::atomic_add(&(ulisttot(jju_index,iatom).re), sfac * u_accum.re); + Kokkos::atomic_add(&(ulisttot(jju_index,iatom).im), sfac * u_accum.im); + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) + // if j is even (-> physical j integer), last element maps to self, skip + //if (!(m == total_iters - 1 && j % 2 == 0)) { + if (m < total_iters - 1 || j % 2 == 1) { + const int sign_factor = (((ma+mb)%2==0)?1:-1); + const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); + const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1); + + + if (sign_factor == 1) { + u_accum.im = -u_accum.im; + } else { + u_accum.re = -u_accum.re; + } + //ulist(jjup_flip,iatom,jnbor) = u_accum; + buf2[jju_shared_flip] = u_accum; + + Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).re), sfac * u_accum.re); + Kokkos::atomic_add(&(ulisttot(jjup_flip,iatom).im), sfac * u_accum.im); + } + }); + // In CUDA backend, + // ThreadVectorRange has a __syncwarp (appropriately masked for + // vector lengths < 32) implict at the end + + // swap double buffers + auto tmp = buf1; buf1 = buf2; buf2 = tmp; + + + } } +/* ---------------------------------------------------------------------- + compute Ui by summing over bispectrum components. CPU only. +------------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) @@ -327,37 +460,6 @@ void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_uitot(const typename Kokkos::TeamPolicy::member_type& team, int idx, int iatom, int ninside) -{ - // fuse initialize in, avoid this load? - SNAcomplex utot = ulisttot(idx, iatom); - for (int jnbor = 0; jnbor < ninside; jnbor++) { - - const auto x = rij(iatom,jnbor,0); - const auto y = rij(iatom,jnbor,1); - const auto z = rij(iatom,jnbor,2); - const auto rsq = x * x + y * y + z * z; - const auto r = sqrt(rsq); - - const double wj_local = wj(iatom, jnbor); - const double rcut = rcutij(iatom, jnbor); - const double sfac = compute_sfac(r, rcut) * wj_local; - - auto ulist_local = ulist(idx, iatom, jnbor); - utot.re += sfac * ulist_local.re; - utot.im += sfac * ulist_local.im; - } - - ulisttot(idx, iatom) = utot; - -} - /* ---------------------------------------------------------------------- compute Zi by summing over products of Ui not updated yet @@ -509,72 +611,203 @@ void SNAKokkos::compute_yi(int iter, } /* ---------------------------------------------------------------------- - compute dEidRj + Fused calculation of the derivative of Ui w.r.t. atom j + and of dEidRj. GPU only. ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_deidrj(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int iatom, const int jnbor) { - t_scalar3 final_sum; + // get shared memory offset + const int max_m_tile = (twojmax+1)*(twojmax/2+1); + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * max_m_tile; - // Like in ComputeUi/ComputeDuidrj, regular loop over j. - for (int j = 0; j <= twojmax; j++) { - int jju = idxu_block(j); + // double buffer for ulist + SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; + SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; - // Flatten loop over ma, mb, reduce w/in + // double buffer for dulist + SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; + SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; + const double x = rij(iatom,jnbor,0); + const double y = rij(iatom,jnbor,1); + const double z = rij(iatom,jnbor,2); + const double rsq = x * x + y * y + z * z; + const double r = sqrt(rsq); + const double rcut = rcutij(iatom, jnbor); + const double rscale0 = rfac0 * MY_PI / (rcut - rmin0); + const double theta0 = (r - rmin0) * rscale0; + const double cs = cos(theta0); + const double sn = sin(theta0); + const double z0 = r * cs / sn; + const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; + + const double wj_local = wj(iatom, jnbor); + const double sfac = wj_local * compute_sfac(r, rcut); + const double dsfac = wj_local * compute_dsfac(r, rcut); + + const double rinv = 1.0 / r; + + // extract a single unit vector + const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv); + + // Compute Cayley-Klein parameters for unit quaternion + const double r0inv = 1.0 / sqrt(r * r + z0 * z0); + + const SNAcomplex a = { r0inv * z0, -r0inv * z }; + const SNAcomplex b = { r0inv * y, -r0inv * x }; + + const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); + const double dr0inv = dr0invdr * u; + const double dz0 = dz0dr * u; + + const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv, + - z * dr0inv + (dir == 2 ? - r0inv : 0.) }; + + const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.), + -x * dr0inv + (dir==0?-r0inv:0.) }; + + // Accumulate the full contribution to dedr on the fly + const double du_prod = dsfac * u; // chain rule + const SNAcomplex y_local = ylist(0, iatom); + + // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0 + double dedr_full_sum = 0.5 * du_prod * y_local.re; + + // single has a warp barrier at the end + Kokkos::single(Kokkos::PerThread(team), [=]() { + //dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here + ulist_buf1[0] = {1., 0.}; + dulist_buf1[0] = {0., 0.}; + }); + + for (int j = 1; j <= twojmax; j++) { + int jju = idxu_block[j]; + int jjup = idxu_block[j-1]; + + // flatten the loop over ma,mb + + // for (int ma = 0; ma <= j; ma++) const int n_ma = j+1; // for (int mb = 0; 2*mb <= j; mb++) const int n_mb = j/2+1; const int total_iters = n_ma * n_mb; - t_scalar3 sum; + double dedr_sum = 0.; // j-local sum //for (int m = 0; m < total_iters; m++) { Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters), - [&] (const int m, t_scalar3& sum_tmp) { + [&] (const int m, double& sum_tmp) { // ma fast, mb slow int ma = m % n_ma; int mb = m / n_ma; - // get index - const int jju_index = jju+mb+mb*j+ma; - - // get ylist, rescale last element by 0.5 - SNAcomplex y_local = ylist(jju_index,iatom); - - const SNAcomplex du_x = dulist(jju_index,iatom,jnbor,0); - const SNAcomplex du_y = dulist(jju_index,iatom,jnbor,1); - const SNAcomplex du_z = dulist(jju_index,iatom,jnbor,2); + const int jju_index = jju+m; + // Load y_local, apply the symmetry scaling factor + // The "secret" of the shared memory optimization is it eliminates + // all global memory reads to duidrj in lieu of caching values in + // shared memory and otherwise always writing, making the kernel + // ultimately compute bound. We take advantage of that by adding + // some reads back in. + auto y_local = ylist(jju_index,iatom); if (j % 2 == 0 && 2*mb == j) { if (ma == mb) { y_local = 0.5*y_local; } - else if (ma > mb) { y_local = { 0., 0. }; } + else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright // else the ma < mb gets "double counted", cancelling the 0.5. } - sum_tmp.x += du_x.re * y_local.re + du_x.im * y_local.im; - sum_tmp.y += du_y.re * y_local.re + du_y.im * y_local.im; - sum_tmp.z += du_z.re * y_local.re + du_z.im * y_local.im; + // index into shared memory + const int jju_shared_idx = m; + const int jjup_shared_idx = jju_shared_idx - mb; - }, sum); // end loop over flattened ma,mb + // Need to compute and accumulate both u and du (mayhaps, we could probably + // balance some read and compute by reading u each time). + SNAcomplex u_accum = { 0., 0. }; + SNAcomplex du_accum = { 0., 0. }; - final_sum.x += sum.x; - final_sum.y += sum.y; - final_sum.z += sum.z; + const double rootpq2 = -rootpqarray(ma, j - mb); + const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); + caconjxpy(b, u_up2, u_accum); + + const double rootpq1 = rootpqarray(j - ma, j - mb); + const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.); + caconjxpy(a, u_up1, u_accum); + + // Next, spin up du_accum + const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.); + caconjxpy(da, u_up1, du_accum); + caconjxpy(a, du_up1, du_accum); + + const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.); + caconjxpy(db, u_up2, du_accum); + caconjxpy(b, du_up2, du_accum); + + // No need to save u_accum to global memory + // Cache u_accum, du_accum to scratch memory. + ulist_buf2[jju_shared_idx] = u_accum; + dulist_buf2[jju_shared_idx] = du_accum; + + // Directly accumulate deidrj into sum_tmp + //dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); + const SNAcomplex du_prod = ((dsfac * u)*u_accum) + (sfac*du_accum); + sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im; + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) + if (j%2==1 && mb+1==n_mb) { + int sign_factor = (((ma+mb)%2==0)?1:-1); + //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist + const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); + + if (sign_factor == 1) { + u_accum.im = -u_accum.im; + du_accum.im = -du_accum.im; + } else { + u_accum.re = -u_accum.re; + du_accum.re = -du_accum.re; + } + + // We don't need the second half of the tile for the deidrj accumulation. + // That's taken care of by the symmetry factor above. + //dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); + + // We do need it for ortho polynomial generation, though + ulist_buf2[jju_shared_flip] = u_accum; + dulist_buf2[jju_shared_flip] = du_accum; + } + + }, dedr_sum); + + // swap buffers + auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp; + tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp; + + // Accumulate dedr. This "should" be in a single, but + // a Kokkos::single call implies a warp sync, and we may + // as well avoid that. This does no harm as long as the + // final assignment is in a single block. + //Kokkos::single(Kokkos::PerThread(team), [=]() { + dedr_full_sum += dedr_sum; + //}); } + // Store the accumulated dedr. Kokkos::single(Kokkos::PerThread(team), [&] () { - dedr(iatom,jnbor,0) = final_sum.x*2.0; - dedr(iatom,jnbor,1) = final_sum.y*2.0; - dedr(iatom,jnbor,2) = final_sum.z*2.0; + dedr(iatom,jnbor,dir) = dedr_full_sum*2.0; }); - } +/* ---------------------------------------------------------------------- + compute dEidRj, CPU path only. +------------------------------------------------------------------------- */ + + template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) @@ -624,6 +857,7 @@ void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy /* ---------------------------------------------------------------------- compute Bi by summing conj(Ui)*Zi + not updated yet ------------------------------------------------------------------------- */ template @@ -708,28 +942,6 @@ void SNAKokkos::compute_bi(const typename Kokkos::TeamPolicy -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duidrj(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) -{ - double rsq, r, x, y, z, z0, theta0, cs, sn; - double dz0dr; - - x = rij(iatom,jnbor,0); - y = rij(iatom,jnbor,1); - z = rij(iatom,jnbor,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); - theta0 = (r - rmin0) * rscale0; - cs = cos(theta0); - sn = sin(theta0); - z0 = r * cs / sn; - dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; - - compute_duarray(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor)); -} - template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) @@ -774,119 +986,6 @@ void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_uarray(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, - double x, double y, double z, - double z0, double r) -{ - // define size of scratch memory buffer - const int max_m_tile = (twojmax+1)*(twojmax+1); - const int team_rank = team.team_rank(); - - // get scratch memory double buffer - SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - - // compute Cayley-Klein parameters for unit quaternion, - // pack into complex number - double r0inv = 1.0 / sqrt(r * r + z0 * z0); - SNAcomplex a = { r0inv * z0, -r0inv * z }; - SNAcomplex b = { r0inv * y, -r0inv * x }; - - // VMK Section 4.8.2 - - // All writes go to global memory and shared memory - // so we can avoid all global memory reads - Kokkos::single(Kokkos::PerThread(team), [=]() { - ulist(0,iatom,jnbor) = { 1.0, 0.0 }; - buf1[max_m_tile*team_rank] = {1.,0.}; - }); - - for (int j = 1; j <= twojmax; j++) { - const int jju = idxu_block[j]; - int jjup = idxu_block[j-1]; - - // fill in left side of matrix layer from previous layer - - // Flatten loop over ma, mb, need to figure out total - // number of iterations - - // for (int ma = 0; ma <= j; ma++) - const int n_ma = j+1; - // for (int mb = 0; 2*mb <= j; mb++) - const int n_mb = j/2+1; - - const int total_iters = n_ma * n_mb; - - //for (int m = 0; m < total_iters; m++) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters), - [&] (const int m) { - - // ma fast, mb slow - int ma = m % n_ma; - int mb = m / n_ma; - - // index into global memory array - const int jju_index = jju+mb+mb*j+ma; - - // index into shared memory buffer for previous level - const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma; - - // index into shared memory buffer for next level - const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma; - - SNAcomplex u_accum = {0., 0.}; - - // VMK recursion relation: grab contribution which is multiplied by a* - const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.); - caconjxpy(a, u_up1, u_accum); - - // VMK recursion relation: grab contribution which is multiplied by b* - const double rootpq2 = -rootpqarray(ma, j - mb); - const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); - caconjxpy(b, u_up2, u_accum); - - ulist(jju_index,iatom,jnbor) = u_accum; - - // We no longer accumulate into ulisttot in this kernel. - // Instead, we have a separate kernel which avoids atomics. - // Running two separate kernels is net faster. - - // back up into shared memory for next iter - if (j != twojmax) buf2[jju_shared_idx] = u_accum; - - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) - // We can avoid this if we're on the last row for an integer j - if (!(n_ma % 2 == 1 && (mb+1) == n_mb)) { - - int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1); - const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); - const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1); - - if (sign_factor == 1) { - u_accum.im = -u_accum.im; - } else { - u_accum.re = -u_accum.re; - } - ulist(jjup_flip,iatom,jnbor) = u_accum; - if (j != twojmax) buf2[jju_shared_flip] = u_accum; - } - }); - // In CUDA backend, - // ThreadVectorRange has a __syncwarp (appropriately masked for - // vector lengths < 32) implicit at the end - - // swap double buffers - auto tmp = buf1; buf1 = buf2; buf2 = tmp; - //std::swap(buf1, buf2); // throws warnings - - } -} - -// CPU version template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, @@ -976,152 +1075,9 @@ void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy /* ---------------------------------------------------------------------- compute derivatives of Wigner U-functions for one neighbor - see comments in compute_uarray() + see comments in compute_uarray_cpu() ------------------------------------------------------------------------- */ -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duarray(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, - double x, double y, double z, - double z0, double r, double dz0dr, - double wj, double rcut) -{ - - // get shared memory offset - const int max_m_tile = (twojmax+1)*(twojmax+1); - const int team_rank = team.team_rank(); - - // double buffer for ulist - SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - - // double buffer for dulist - SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); - - const double sfac = wj * compute_sfac(r, rcut); - const double dsfac = wj * compute_dsfac(r, rcut); - - const double rinv = 1.0 / r; - - // extract a single unit vector - const double u = (dir == 0 ? x * rinv : dir == 1 ? y * rinv : z * rinv); - - // Compute Cayley-Klein parameters for unit quaternion - - const double r0inv = 1.0 / sqrt(r * r + z0 * z0); - - const SNAcomplex a = { r0inv * z0, -r0inv * z }; - const SNAcomplex b = { r0inv * y, -r0inv * x }; - - const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); - const double dr0inv = dr0invdr * u; - const double dz0 = dz0dr * u; - - const SNAcomplex da = { dz0 * r0inv + z0 * dr0inv, - - z * dr0inv + (dir == 2 ? - r0inv : 0.) }; - - const SNAcomplex db = { y * dr0inv + (dir==1?r0inv:0.), - -x * dr0inv + (dir==0?-r0inv:0.) }; - - // single has a warp barrier at the end - Kokkos::single(Kokkos::PerThread(team), [=]() { - dulist(0,iatom,jnbor,dir) = { dsfac * u, 0. }; // fold in chain rule here - ulist_buf1[max_m_tile*team_rank] = {1., 0.}; - dulist_buf1[max_m_tile*team_rank] = {0., 0.}; - }); - - - for (int j = 1; j <= twojmax; j++) { - int jju = idxu_block[j]; - int jjup = idxu_block[j-1]; - - // flatten the loop over ma,mb - - // for (int ma = 0; ma <= j; ma++) - const int n_ma = j+1; - // for (int mb = 0; 2*mb <= j; mb++) - const int n_mb = j/2+1; - - const int total_iters = n_ma * n_mb; - - //for (int m = 0; m < total_iters; m++) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters), - [&] (const int m) { - - // ma fast, mb slow - int ma = m % n_ma; - int mb = m / n_ma; - - const int jju_index = jju+mb+mb*j+ma; - - // index into shared memory - const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma; - const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma; - - // Need to compute and accumulate both u and du (mayhaps, we could probably - // balance some read and compute by reading u each time). - SNAcomplex u_accum = { 0., 0. }; - SNAcomplex du_accum = { 0., 0. }; - - const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = (ma < j)?rootpq1*ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.); - caconjxpy(a, u_up1, u_accum); - - const double rootpq2 = -rootpqarray(ma, j - mb); - const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); - caconjxpy(b, u_up2, u_accum); - - // No need to save u_accum to global memory - if (j != twojmax) ulist_buf2[jju_shared_idx] = u_accum; - - // Next, spin up du_accum - const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.); - caconjxpy(da, u_up1, du_accum); - caconjxpy(a, du_up1, du_accum); - - const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.); - caconjxpy(db, u_up2, du_accum); - caconjxpy(b, du_up2, du_accum); - - dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); - - if (j != twojmax) dulist_buf2[jju_shared_idx] = du_accum; - - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) - - int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1); - const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); - const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1); - - if (sign_factor == 1) { - //ulist_alt(iatom,jnbor,jjup_flip).re = u_accum.re; - //ulist_alt(iatom,jnbor,jjup_flip).im = -u_accum.im; - u_accum.im = -u_accum.im; - du_accum.im = -du_accum.im; - } else { - //ulist_alt(iatom,jnbor,jjup_flip).re = -u_accum.re; - //ulist_alt(iatom,jnbor,jjup_flip).im = u_accum.im; - u_accum.re = -u_accum.re; - du_accum.re = -du_accum.re; - } - - dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); - - if (j != twojmax) { - ulist_buf2[jju_shared_flip] = u_accum; - dulist_buf2[jju_shared_flip] = du_accum; - } - - }); - - // swap buffers - auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp; - tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp; - } -} - template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, @@ -1680,11 +1636,17 @@ double SNAKokkos::memory_usage() bytes += jdimpq*jdimpq * sizeof(double); // pqarray bytes += idxcg_max * sizeof(double); // cglist - bytes += natom * idxu_max * sizeof(double) * 2; // ulist +#ifdef KOKKOS_ENABLE_CUDA + if (!std::is_same::value) { +#endif + bytes += natom * idxu_max * sizeof(double) * 2; // ulist + bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist +#ifdef KOKKOS_ENABLE_CUDA + } +#endif bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot if (!Kokkos::Impl::is_same::value) bytes += natom * idxu_max * sizeof(double) * 2; // ulisttot_lr - bytes += natom * idxu_max * 3 * sizeof(double) * 2; // dulist bytes += natom * idxz_max * sizeof(double) * 2; // zlist bytes += natom * idxb_max * sizeof(double); // blist From 36095bbfdf39c85b6932bd0ec03ac062209e497e Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 23 Mar 2020 21:15:00 -0600 Subject: [PATCH 07/14] Tweak comment --- src/KOKKOS/sna_kokkos_impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 1daf8fd05c..182a49bfb1 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -333,7 +333,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy Date: Mon, 23 Mar 2020 23:31:12 -0400 Subject: [PATCH 08/14] Cleaned up comment. --- src/KOKKOS/sna_kokkos_impl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 182a49bfb1..e7b7087951 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -462,7 +462,6 @@ void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy From 5fa99cb07281675ebde5ada638d6597ba23a95f5 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 23 Mar 2020 21:33:11 -0600 Subject: [PATCH 09/14] Comment cleanup --- src/KOKKOS/sna_kokkos_impl.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index e7b7087951..e6c34a245b 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -856,7 +856,6 @@ void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy /* ---------------------------------------------------------------------- compute Bi by summing conj(Ui)*Zi - not updated yet ------------------------------------------------------------------------- */ template From 0060473cee2824527c44a1839ae7a870f42f9b08 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 24 Mar 2020 11:35:21 -0400 Subject: [PATCH 10/14] fix up some escaped '*' characters in "code-block" sections that do not need to be escaped --- doc/src/bond_oxdna.rst | 2 +- doc/src/pair_bop.rst | 4 ++-- doc/src/pair_class2.rst | 8 ++++---- doc/src/pair_coeff.rst | 12 ++++++------ doc/src/pair_cosine_squared.rst | 2 +- doc/src/pair_meam_sw_spline.rst | 4 +++- doc/src/pair_nb3b_harmonic.rst | 4 +++- doc/src/pair_polymorphic.rst | 4 ++-- doc/src/pair_python.rst | 12 ++++++------ doc/src/pair_spin_magelec.rst | 2 +- 10 files changed, 29 insertions(+), 25 deletions(-) diff --git a/doc/src/bond_oxdna.rst b/doc/src/bond_oxdna.rst index 8e69b298bf..71e5105436 100644 --- a/doc/src/bond_oxdna.rst +++ b/doc/src/bond_oxdna.rst @@ -32,7 +32,7 @@ Examples bond_coeff * 2.0 0.25 0.7564 bond_style oxrna2/fene - bond_coeff \* 2.0 0.25 0.76107 + bond_coeff * 2.0 0.25 0.76107 Description """"""""""" diff --git a/doc/src/pair_bop.rst b/doc/src/pair_bop.rst index ce7b69f41f..5cd045931a 100644 --- a/doc/src/pair_bop.rst +++ b/doc/src/pair_bop.rst @@ -132,9 +132,9 @@ and Te. If your LAMMPS simulation has 4 atoms types and you want the 1st 3 to be Cd, and the 4th to be Te, you would use the following pair_coeff command: -.. parsed-literal:: +.. code-block:: LAMMPS - pair_coeff \* \* CdTe Cd Cd Cd Te + pair_coeff * * CdTe Cd Cd Cd Te The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types. The first three Cd arguments map LAMMPS atom types 1,2,3 to the Cd diff --git a/doc/src/pair_class2.rst b/doc/src/pair_class2.rst index d1c673ab97..8131799181 100644 --- a/doc/src/pair_class2.rst +++ b/doc/src/pair_class2.rst @@ -60,18 +60,18 @@ Examples .. code-block:: LAMMPS pair_style lj/class2 10.0 - pair_coeff \* \* 100.0 2.5 - pair_coeff 1 2\* 100.0 2.5 9.0 + pair_coeff * * 100.0 2.5 + pair_coeff 1 2* 100.0 2.5 9.0 pair_style lj/class2/coul/cut 10.0 pair_style lj/class2/coul/cut 10.0 8.0 - pair_coeff \* \* 100.0 3.0 + pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 pair_coeff 1 1 100.0 3.5 9.0 9.0 pair_style lj/class2/coul/long 10.0 pair_style lj/class2/coul/long 10.0 8.0 - pair_coeff \* \* 100.0 3.0 + pair_coeff * * 100.0 3.0 pair_coeff 1 1 100.0 3.5 9.0 Description diff --git a/doc/src/pair_coeff.rst b/doc/src/pair_coeff.rst index 26910c1746..1886ce1118 100644 --- a/doc/src/pair_coeff.rst +++ b/doc/src/pair_coeff.rst @@ -19,11 +19,11 @@ Examples .. code-block:: LAMMPS pair_coeff 1 2 1.0 1.0 2.5 - pair_coeff 2 \* 1.0 1.0 - pair_coeff 3\* 1\*2 1.0 1.0 2.5 - pair_coeff \* \* 1.0 1.0 - pair_coeff \* \* nialhjea 1 1 2 - pair_coeff \* 3 morse.table ENTRY1 + pair_coeff 2 * 1.0 1.0 + pair_coeff 3* 1*2 1.0 1.0 2.5 + pair_coeff * * 1.0 1.0 + pair_coeff * * nialhjea 1 1 2 + pair_coeff * 3 morse.table ENTRY1 pair_coeff 1 2 lj/cut 1.0 1.0 2.5 (for pair_style hybrid) Description @@ -55,7 +55,7 @@ pairs, then overwrite the coeffs for just the I,J = 2,3 pair: .. code-block:: LAMMPS - pair_coeff \* \* 1.0 1.0 2.5 + pair_coeff * * 1.0 1.0 2.5 pair_coeff 2 3 2.0 1.0 1.12 A line in a data file that specifies pair coefficients uses the exact diff --git a/doc/src/pair_cosine_squared.rst b/doc/src/pair_cosine_squared.rst index 4f19ddd1e4..b7fa29bbd5 100644 --- a/doc/src/pair_cosine_squared.rst +++ b/doc/src/pair_cosine_squared.rst @@ -31,7 +31,7 @@ Examples .. code-block:: LAMMPS pair_style cosine/squared 3.0 - pair_coeff \* \* 1.0 1.3 + pair_coeff * * 1.0 1.3 pair_coeff 1 3 1.0 1.3 2.0 pair_coeff 1 3 1.0 1.3 wca pair_coeff 1 3 1.0 1.3 2.0 wca diff --git a/doc/src/pair_meam_sw_spline.rst b/doc/src/pair_meam_sw_spline.rst index ebb51795c2..0cf4c57975 100644 --- a/doc/src/pair_meam_sw_spline.rst +++ b/doc/src/pair_meam_sw_spline.rst @@ -75,7 +75,9 @@ If your LAMMPS simulation has 3 atoms types and they are all to be treated with this potential, you would use the following pair_coeff command: -pair_coeff \* \* Ti.meam.sw.spline Ti Ti Ti +.. code-block:: LAMMPS + + pair_coeff * * Ti.meam.sw.spline Ti Ti Ti The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types. The three Ti arguments map LAMMPS atom types 1,2,3 to the Ti element diff --git a/doc/src/pair_nb3b_harmonic.rst b/doc/src/pair_nb3b_harmonic.rst index c4f3f3c3e8..9d63df65d5 100644 --- a/doc/src/pair_nb3b_harmonic.rst +++ b/doc/src/pair_nb3b_harmonic.rst @@ -64,7 +64,9 @@ NULL values are placeholders for atom types that will be used with other potentials. An example of a pair_coeff command for use with the *hybrid* pair style is: -pair_coeff \* \* nb3b/harmonic MgOH.nb3b.harmonic Mg O H +.. code-block:: LAMMPS + + pair_coeff * * nb3b/harmonic MgOH.nb3b.harmonic Mg O H Three-body non-bonded harmonic files in the *potentials* directory of the LAMMPS distribution have a ".nb3b.harmonic" suffix. Lines that diff --git a/doc/src/pair_polymorphic.rst b/doc/src/pair_polymorphic.rst index e06e9e7855..c0db3f10a4 100644 --- a/doc/src/pair_polymorphic.rst +++ b/doc/src/pair_polymorphic.rst @@ -180,9 +180,9 @@ functions for Si-C tersoff potential. If your LAMMPS simulation has 4 atoms types and you want the 1st 3 to be Si, and the 4th to be C, you would use the following pair_coeff command: -.. parsed-literal:: +.. code-block:: LAMMPS - pair_coeff \* \* SiC_tersoff.poly Si Si Si C + pair_coeff * * SiC_tersoff.poly Si Si Si C The 1st 2 arguments must be \* \* so as to span all LAMMPS atom types. The first three Si arguments map LAMMPS atom types 1,2,3 to the diff --git a/doc/src/pair_python.rst b/doc/src/pair_python.rst index fa76d4c16c..e654a6025f 100644 --- a/doc/src/pair_python.rst +++ b/doc/src/pair_python.rst @@ -113,8 +113,8 @@ which the parameters epsilon and sigma are both 1.0: class LJCutMelt(LAMMPSPairPotential): def __init__(self): super(LJCutMelt,self).__init__() - # set coeffs: 48\*eps\*sig\*\*12, 24\*eps\*sig\*\*6, - # 4\*eps\*sig\*\*12, 4\*eps\*sig\*\*6 + # set coeffs: 48*eps*sig**12, 24*eps*sig**6, + # 4*eps*sig**12, 4*eps*sig**6 self.units = 'lj' self.coeff = {'lj' : {'lj' : (48.0,24.0,4.0,4.0)}} @@ -137,18 +137,18 @@ the *LJCutMelt* example, here are the two functions: def compute_force(self,rsq,itype,jtype): coeff = self.coeff[self.pmap[itype]][self.pmap[jtype]] r2inv = 1.0/rsq - r6inv = r2inv\*r2inv\*r2inv + r6inv = r2inv*r2inv*r2inv lj1 = coeff[0] lj2 = coeff[1] - return (r6inv \* (lj1\*r6inv - lj2))\*r2inv + return (r6inv * (lj1*r6inv - lj2))*r2inv def compute_energy(self,rsq,itype,jtype): coeff = self.coeff[self.pmap[itype]][self.pmap[jtype]] r2inv = 1.0/rsq - r6inv = r2inv\*r2inv\*r2inv + r6inv = r2inv*r2inv*r2inv lj3 = coeff[2] lj4 = coeff[3] - return (r6inv \* (lj3\*r6inv - lj4)) + return (r6inv * (lj3*r6inv - lj4)) .. note:: diff --git a/doc/src/pair_spin_magelec.rst b/doc/src/pair_spin_magelec.rst index a220299b07..e4d6b81b8a 100644 --- a/doc/src/pair_spin_magelec.rst +++ b/doc/src/pair_spin_magelec.rst @@ -18,7 +18,7 @@ Examples .. code-block:: LAMMPS pair_style spin/magelec 4.5 - pair_coeff \* \* magelec 4.5 0.00109 1.0 1.0 1.0 + pair_coeff * * magelec 4.5 0.00109 1.0 1.0 1.0 Description """"""""""" From 0f35c1d009b70258c3c967e51abbd11e923c6f30 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Tue, 24 Mar 2020 18:43:20 -0400 Subject: [PATCH 11/14] Update ubuntu package name to libpng-dev --- doc/src/Howto_bash.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/src/Howto_bash.rst b/doc/src/Howto_bash.rst index 02322e5e1c..98cc6c0dff 100644 --- a/doc/src/Howto_bash.rst +++ b/doc/src/Howto_bash.rst @@ -103,7 +103,7 @@ needed for various LAMMPS features: .. code-block:: bash - sudo apt install -y build-essential ccache gfortran openmpi-bin libopenmpi-dev libfftw3-dev libjpeg-dev libpng12-dev python-dev python-virtualenv libblas-dev liblapack-dev libhdf5-serial-dev hdf5-tools + sudo apt install -y build-essential ccache gfortran openmpi-bin libopenmpi-dev libfftw3-dev libjpeg-dev libpng-dev python-dev python-virtualenv libblas-dev liblapack-dev libhdf5-serial-dev hdf5-tools Files in Ubuntu on Windows ^^^^^^^^^^^^^^^^^^^^^^^^^^ From 431647d943d64001e2e2d6a6e01294c2903c42d7 Mon Sep 17 00:00:00 2001 From: Richard Berger Date: Tue, 24 Mar 2020 18:52:05 -0400 Subject: [PATCH 12/14] Add link to official WSL docs --- doc/src/Howto_bash.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/src/Howto_bash.rst b/doc/src/Howto_bash.rst index 98cc6c0dff..b452f579d0 100644 --- a/doc/src/Howto_bash.rst +++ b/doc/src/Howto_bash.rst @@ -12,6 +12,10 @@ via apt-get and all files are accessible in both the Windows Explorer and your Linux shell (bash). This avoids switching to a different operating system or installing a virtual machine. Everything runs on Windows. +.. seealso:: + + You can find more detailed information at the `Windows Subsystem for Linux Installation Guide for Windows 10 `_. + Installing Bash on Windows -------------------------- From 398c030925e8175f6e15177f2d9b3194fca9524a Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 25 Mar 2020 06:52:37 -0400 Subject: [PATCH 13/14] whitespace cleanup --- src/KOKKOS/sna_kokkos_impl.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index e6c34a245b..dcedf333e5 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -382,7 +382,7 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); //const SNAcomplex u_up2 = (ma > 0)?rootpq2*ulist(jjup_index-1,iatom,jnbor):SNAcomplex(0.,0.); caconjxpy(b, u_up2, u_accum); - + // VMK recursion relation: grab contribution which is multiplied by a* const double rootpq1 = rootpqarray(j - ma, j - mb); const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.); @@ -399,12 +399,12 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy physical j integer), last element maps to self, skip - //if (!(m == total_iters - 1 && j % 2 == 0)) { + //if (!(m == total_iters - 1 && j % 2 == 0)) { if (m < total_iters - 1 || j % 2 == 1) { const int sign_factor = (((ma+mb)%2==0)?1:-1); const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); const int jjup_flip = jju + jju_shared_flip; // jju+(j+1-mb)*(j+1)-(ma+1); - + if (sign_factor == 1) { u_accum.im = -u_accum.im; @@ -419,12 +419,12 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::compute_fused_deidrj(const typename Kokkos::TeamPoli // copy left side to right side with inversion symmetry VMK 4.4(2) // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) if (j%2==1 && mb+1==n_mb) { - int sign_factor = (((ma+mb)%2==0)?1:-1); + int sign_factor = (((ma+mb)%2==0)?1:-1); //const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); // no longer needed b/c we don't update dulist const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); @@ -787,18 +787,18 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp; tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp; - // Accumulate dedr. This "should" be in a single, but + // Accumulate dedr. This "should" be in a single, but // a Kokkos::single call implies a warp sync, and we may // as well avoid that. This does no harm as long as the // final assignment is in a single block. //Kokkos::single(Kokkos::PerThread(team), [=]() { - dedr_full_sum += dedr_sum; + dedr_full_sum += dedr_sum; //}); } // Store the accumulated dedr. Kokkos::single(Kokkos::PerThread(team), [&] () { - dedr(iatom,jnbor,dir) = dedr_full_sum*2.0; + dedr(iatom,jnbor,dir) = dedr_full_sum*2.0; }); } From 1192845ad58327cee3c3e4608596baf8f848b1e7 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Wed, 25 Mar 2020 08:19:24 -0400 Subject: [PATCH 14/14] avoid segmentation faults in universe/uloop variable increment --- src/variable.cpp | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/variable.cpp b/src/variable.cpp index 1093ce9066..0fc53f8074 100644 --- a/src/variable.cpp +++ b/src/variable.cpp @@ -661,6 +661,8 @@ int Variable::next(int narg, char **arg) } else if (istyle == UNIVERSE || istyle == ULOOP) { + uloop_again: + // wait until lock file can be created and owned by proc 0 of this world // rename() is not atomic in practice, but no known simple fix // means multiple procs can read/write file at the same time (bad!) @@ -669,7 +671,7 @@ int Variable::next(int narg, char **arg) // delay for random fraction of 1 second before subsequent tries // when successful, read next available index and Bcast it within my world - int nextindex; + int nextindex = -1; if (me == 0) { int seed = 12345 + universe->me + which[find(arg[0])]; RanMars *random = new RanMars(lmp,seed); @@ -682,10 +684,33 @@ int Variable::next(int narg, char **arg) } delete random; - FILE *fp = fopen("tmp.lammps.variable.lock","r"); - fscanf(fp,"%d",&nextindex); + // if the file cannot be found, we may have a race with some + // other MPI rank that has called rename at the same time + // and we have to start over. + // if the read is short (we need at least one byte) we try reading again. + + FILE *fp; + char buf[64]; + for (int loopmax = 0; loopmax < 100; ++loopmax) { + fp = fopen("tmp.lammps.variable.lock","r"); + if (fp == NULL) goto uloop_again; + + buf[0] = buf[1] = '\0'; + fread(buf,1,64,fp); + fclose(fp); + + if (strlen(buf) > 0) { + nextindex = atoi(buf); + break; + } + delay = (int) (1000000*random->uniform()); + usleep(delay); + } + if (nextindex < 0) + error->one(FLERR,"Unexpected error while incrementing uloop " + "style variable. Please contact LAMMPS developers."); + //printf("READ %d %d\n",universe->me,nextindex); - fclose(fp); fp = fopen("tmp.lammps.variable.lock","w"); fprintf(fp,"%d\n",nextindex+1); //printf("WRITE %d %d\n",universe->me,nextindex+1);