diff --git a/doc/src/min_style.rst b/doc/src/min_style.rst index 0382f0d075..a7453b5a57 100644 --- a/doc/src/min_style.rst +++ b/doc/src/min_style.rst @@ -182,4 +182,4 @@ Jonsson, Mills, Jacobsen. .. _Guenole: **(Guenole)** Guenole, Noehring, Vaid, Houlle, Xie, Prakash, Bitzek, -Comput Mater Sci, (2020), in press (arXiv:190802038). +Comput Mater Sci, 175, 109584 (2020). diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index f20ab34231..2766ac3391 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -37,12 +37,16 @@ struct TagPairSNAPBeta{}; struct TagPairSNAPComputeNeigh{}; struct TagPairSNAPPreUi{}; struct TagPairSNAPComputeUi{}; +struct TagPairSNAPComputeUiTot{}; // accumulate ulist into ulisttot separately +struct TagPairSNAPComputeUiCPU{}; struct TagPairSNAPComputeZi{}; struct TagPairSNAPComputeBi{}; struct TagPairSNAPZeroYi{}; struct TagPairSNAPComputeYi{}; struct TagPairSNAPComputeDuidrj{}; +struct TagPairSNAPComputeDuidrjCPU{}; struct TagPairSNAPComputeDeidrj{}; +struct TagPairSNAPComputeDeidrjCPU{}; template class PairSNAPKokkos : public PairSNAP { @@ -74,11 +78,17 @@ public: void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPPreUi,const int& ii) const; + void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy::member_type& team) const; + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeZi,const int& ii) const; @@ -86,7 +96,7 @@ public: void operator() (TagPairSNAPComputeBi,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPZeroYi,const int& ii) const; + void operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeYi,const int& ii) const; @@ -94,9 +104,15 @@ public: KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDuidrj,const typename Kokkos::TeamPolicy::member_type& team) const; + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const; + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPBeta,const typename Kokkos::TeamPolicy::member_type& team) const; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 32b63cb776..1156d11c31 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -12,7 +12,8 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing authors: Christian Trott (SNL), Stan Moore (SNL) + Contributing authors: Christian Trott (SNL), Stan Moore (SNL), + Evan Weinberg (NVIDIA) ------------------------------------------------------------------------- */ #include @@ -29,6 +30,7 @@ #include "kokkos.h" #include "sna.h" + #define MAXLINE 1024 #define MAXWORD 3 @@ -194,7 +196,7 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) if (beta_max < inum) { beta_max = inum; - d_beta = Kokkos::View("PairSNAPKokkos:beta",inum,ncoeff); + d_beta = Kokkos::View("PairSNAPKokkos:beta",ncoeff,inum); d_ninside = Kokkos::View("PairSNAPKokkos:ninside",inum); } @@ -205,6 +207,8 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) EV_FLOAT ev; + int idxu_max = snaKK.idxu_max; + while (chunk_offset < inum) { // chunk up loop to prevent running out of memory EV_FLOAT ev_tmp; @@ -217,15 +221,62 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); //PreUi - typename Kokkos::RangePolicy policy_preui(0,chunk_size); - Kokkos::parallel_for("PreUi",policy_preui,*this); + { + int vector_length = 1; + int team_size = 1; + if (lmp->kokkos->ngpus != 0) { + vector_length = 32; + team_size = 32;//max_neighs; + int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; + } + typename Kokkos::TeamPolicy policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length); + Kokkos::parallel_for("PreUi",policy_preui,*this); + } - //ComputeUi - typename Kokkos::TeamPolicy policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - Kokkos::parallel_for("ComputeUi",policy_ui,*this); + // ComputeUI + if (lmp->kokkos->ngpus == 0) { // CPU + // Run a fused calculation of ulist and accumulation into ulisttot using atomics + int vector_length = 1; + int team_size = 1; + + typename Kokkos::TeamPolicy policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + + Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); + } else { // GPU, vector parallelism, shared memory, separate ulist and ulisttot to avoid atomics + + // ComputeUi + int vector_length = 32; + int team_size = 4; // need to cap b/c of shared memory reqs + int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; + + // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values + // 2 is for double buffer + typename Kokkos::TeamPolicy policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + + typedef Kokkos::View< SNAcomplex*, + Kokkos::DefaultExecutionSpace::scratch_memory_space, + Kokkos::MemoryTraits > + ScratchViewType; + int scratch_size = ScratchViewType::shmem_size( 2 * team_size * (twojmax+1)*(twojmax+1)); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size )); + + Kokkos::parallel_for("ComputeUi",policy_ui,*this); + + // ComputeUitot + vector_length = 1; + team_size = 128; + team_size_max = Kokkos::TeamPolicy::team_size_max(*this); + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; + + typename Kokkos::TeamPolicy policy_ui_tot(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length); + Kokkos::parallel_for("ComputeUiTot",policy_ui_tot,*this); + } - //Ulisttot transpose - snaKK.transpose_ulisttot(); //Compute bispectrum if (quadraticflag || eflag) { @@ -237,15 +288,28 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) //ComputeBi typename Kokkos::TeamPolicy policy_bi(chunk_size,team_size,vector_length); Kokkos::parallel_for("ComputeBi",policy_bi,*this); + } //Compute beta = dE_i/dB_i for all i in list typename Kokkos::TeamPolicy policy_beta(chunk_size,team_size,vector_length); Kokkos::parallel_for("ComputeBeta",policy_beta,*this); - //ComputeYi - typename Kokkos::RangePolicy policy_zero_yi(0,chunk_size); - Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this); + //ZeroYi + { + int vector_length = 1; + int team_size = 1; + int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); + +#ifdef KOKKOS_ENABLE_CUDA + team_size = 128; + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +#endif + + typename Kokkos::TeamPolicy policy_zero_yi(((idxu_max+team_size-1)/team_size)*chunk_size,team_size,vector_length); + Kokkos::parallel_for("ZeroYi",policy_zero_yi,*this); + } //ComputeYi int idxz_max = snaKK.idxz_max; @@ -253,12 +317,59 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeYi",policy_yi,*this); //ComputeDuidrj - typename Kokkos::TeamPolicy policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this); + if (lmp->kokkos->ngpus == 0) { // CPU + int vector_length = 1; + int team_size = 1; + + typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + snaKK.set_dir(-1); // technically doesn't do anything + Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); + } else { // GPU, utilize scratch memory and splitting over dimensions + + int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); + int vector_length = 32; + int team_size = 2; // need to cap b/c of shared memory reqs + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; + + // scratch size: 2 * 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values + // 2 is for double buffer + typedef Kokkos::View< SNAcomplex*, + Kokkos::DefaultExecutionSpace::scratch_memory_space, + Kokkos::MemoryTraits > + ScratchViewType; + + int scratch_size = ScratchViewType::shmem_size( 4 * team_size * (twojmax+1)*(twojmax+1)); + typename Kokkos::TeamPolicy policy_duidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + policy_duidrj = policy_duidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size )); + // Need to call three times, once for each direction + for (int k = 0; k < 3; k++) { + snaKK.set_dir(k); + Kokkos::parallel_for("ComputeDuidrj",policy_duidrj,*this); + } + } //ComputeDeidrj - typename Kokkos::TeamPolicy policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this); + if (lmp->kokkos->ngpus == 0) { // CPU + int vector_length = 1; + int team_size = 1; + + typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + + Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); + + } else { // GPU, different loop strategy internally + + int team_size_max = Kokkos::TeamPolicy::team_size_max(*this); + int vector_length = 32; // coalescing disaster right now, will fix later + int team_size = 8; + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; + + typename Kokkos::TeamPolicy policy_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); + + Kokkos::parallel_for("ComputeDeidrj",policy_deidrj,*this); + } //ComputeForce if (eflag) { @@ -284,6 +395,7 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) } ev += ev_tmp; chunk_offset += chunk_size; + } // end while if (need_dup) @@ -341,18 +453,18 @@ void PairSNAPKokkos::operator() (TagPairSNAPBeta,const typename Kokk d_coeffi(d_coeffelem,ielem,Kokkos::ALL); for (int icoeff = 0; icoeff < ncoeff; icoeff++) - d_beta(ii,icoeff) = d_coeffi[icoeff+1]; + d_beta(icoeff,ii) = d_coeffi[icoeff+1]; if (quadraticflag) { int k = ncoeff+1; for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - double bveci = my_sna.blist(ii,icoeff); - d_beta(ii,icoeff) += d_coeffi[k]*bveci; + double bveci = my_sna.blist(icoeff,ii); + d_beta(icoeff,ii) += d_coeffi[k]*bveci; k++; for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - double bvecj = my_sna.blist(ii,jcoeff); - d_beta(ii,icoeff) += d_coeffi[k]*bvecj; - d_beta(ii,jcoeff) += d_coeffi[k]*bveci; + double bvecj = my_sna.blist(jcoeff,ii); + d_beta(icoeff,ii) += d_coeffi[k]*bvecj; + d_beta(jcoeff,ii) += d_coeffi[k]*bveci; k++; } } @@ -503,9 +615,14 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typen template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUi,const int& ii) const { +void PairSNAPKokkos::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const { SNAKokkos my_sna = snaKK; - my_sna.pre_ui(ii); + + // Extract the atom number + const int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + + my_sna.pre_ui(team,ii); } template @@ -514,8 +631,7 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeUi,const typename SNAKokkos my_sna = snaKK; // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % - ((chunk_size+team.team_size()-1)/team.team_size())); + int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); if (ii >= chunk_size) return; // Extract the neighbor number @@ -528,9 +644,54 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeUi,const typename template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPZeroYi,const int& ii) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiTot,const typename Kokkos::TeamPolicy::member_type& team) const { SNAKokkos my_sna = snaKK; - my_sna.zero_yi(ii); + + // Extract the quantum number + const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size())); + if (idx >= my_sna.idxu_max) return; + + // Extract the atomic index + const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size()); + if (ii >= chunk_size) return; + + // Extract the number of neighbors neighbor number + const int ninside = d_ninside(ii); + + my_sna.compute_uitot(team,idx,ii,ninside); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_ui_cpu(team,ii,jj); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPZeroYi,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the quantum number + const int idx = team.team_rank() + team.team_size() * (team.league_rank() % ((my_sna.idxu_max+team.team_size()-1)/team.team_size())); + if (idx >= my_sna.idxu_max) return; + + // Extract the atomic index + const int ii = team.league_rank() / ((my_sna.idxu_max+team.team_size()-1)/team.team_size()); + if (ii >= chunk_size) return; + + my_sna.zero_yi(idx,ii); } template @@ -561,8 +722,7 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrj,const type SNAKokkos my_sna = snaKK; // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % - ((chunk_size+team.team_size()-1)/team.team_size())); + int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); if (ii >= chunk_size) return; // Extract the neighbor number @@ -573,14 +733,31 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrj,const type my_sna.compute_duidrj(team,ii,jj); } +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_duidrj_cpu(team,ii,jj); +} + + template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const { SNAKokkos my_sna = snaKK; // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % - ((chunk_size+team.team_size()-1)/team.team_size())); + int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); if (ii >= chunk_size) return; // Extract the neighbor number @@ -591,6 +768,23 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrj,const type my_sna.compute_deidrj(team,ii,jj); } +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_deidrj_cpu(team,ii,jj); +} + template template KOKKOS_INLINE_FUNCTION @@ -658,17 +852,17 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeForce +struct alignas(2*sizeof(real)) SNAComplex +{ + real re,im; + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex() = default; - KOKKOS_INLINE_FUNCTION - SNAcomplex() : re(0),im(0) - {} + KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re) + : re(re), im(static_cast(0.)) { ; } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re, real im) + : re(re), im(im) { ; } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex(const SNAComplex& other) + : re(other.re), im(other.im) { ; } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator=(const SNAComplex& other) { + re = other.re; im = other.im; + return *this; + } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex(SNAComplex&& other) + : re(other.re), im(other.im) { ; } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator=(SNAComplex&& other) { + re = other.re; im = other.im; + return *this; + } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator+(SNAComplex const& other) { + return SNAComplex(re + other.re, im + other.im); + } + + KOKKOS_FORCEINLINE_FUNCTION SNAComplex& operator+=(SNAComplex const& other) { + re += other.re; im += other.im; + return *this; + } - KOKKOS_INLINE_FUNCTION - SNAcomplex(SNAreal real_in, SNAreal imag_in) - :re(real_in),im(imag_in) - {} }; +template +KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator*(const real& r, const SNAComplex& self) { + return SNAComplex(r*self.re, r*self.im); +} + +typedef SNAComplex SNAcomplex; + //struct SNAKK_ZINDICES { // int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju; //}; @@ -58,6 +92,7 @@ public: typedef Kokkos::View > t_sna_1d_atomic; typedef Kokkos::View t_sna_2i; typedef Kokkos::View t_sna_2d; + typedef Kokkos::View t_sna_2d_ll; typedef Kokkos::View t_sna_3d; typedef Kokkos::View t_sna_4d; typedef Kokkos::View t_sna_3d3; @@ -66,32 +101,15 @@ public: typedef Kokkos::View t_sna_1c; typedef Kokkos::View > t_sna_1c_atomic; typedef Kokkos::View t_sna_2c; + typedef Kokkos::View t_sna_2c_ll; typedef Kokkos::View t_sna_2c_lr; typedef Kokkos::View t_sna_3c; + typedef Kokkos::View t_sna_3c_ll; typedef Kokkos::View t_sna_4c; + typedef Kokkos::View t_sna_4c_ll; typedef Kokkos::View t_sna_3c3; typedef Kokkos::View t_sna_5c; -// Helper class to get ulisttot_r - -template -class UlisttotHelper { -public: - inline - static void transpose(T1 &ulisttot_lr, const T2 &ulisttot) { - Kokkos::deep_copy(ulisttot_lr,ulisttot); - } -}; - -template -class UlisttotHelper { -public: - inline - static void transpose(T1 &ulisttot_lr, const T2 &ulisttot) { - ulisttot_lr = ulisttot; - } -}; - inline SNAKokkos() {}; KOKKOS_INLINE_FUNCTION @@ -113,20 +131,21 @@ inline int ncoeff; -inline - void transpose_ulisttot(); - // functions for bispectrum coefficients KOKKOS_INLINE_FUNCTION - void pre_ui(const int&); // ForceSNAP + void pre_ui(const typename Kokkos::TeamPolicy::member_type& team,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_ui(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION + void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP + KOKKOS_INLINE_FUNCTION void compute_ui_orig(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION + void compute_uitot(const typename Kokkos::TeamPolicy::member_type& team, int, int, int); // ForceSNAP + KOKKOS_INLINE_FUNCTION void compute_zi(const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void zero_yi(const int&); + void zero_yi(const int&,const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi(int, const Kokkos::View &beta); // ForceSNAP @@ -138,12 +157,29 @@ inline KOKKOS_INLINE_FUNCTION void compute_duidrj(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP KOKKOS_INLINE_FUNCTION + void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP + KOKKOS_INLINE_FUNCTION void compute_deidrj(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP KOKKOS_INLINE_FUNCTION + void compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP + KOKKOS_INLINE_FUNCTION double compute_sfac(double, double); // add_uarraytot, compute_duarray KOKKOS_INLINE_FUNCTION double compute_dsfac(double, double); // compute_duarray + // efficient complex FMA + // efficient caxpy (i.e., y += a x) + static KOKKOS_FORCEINLINE_FUNCTION + void caxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y); + + // efficient complex FMA, conjugate of scalar + static KOKKOS_FORCEINLINE_FUNCTION + void caconjxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y); + + // Set the direction for split ComputeDuidrj + KOKKOS_INLINE_FUNCTION + void set_dir(int); + #ifdef TIMING_INFO double* timers; timespec starttime, endtime; @@ -165,17 +201,16 @@ inline void grow_rij(int, int); int twojmax, diagonalstyle; + + t_sna_2d_ll blist; + t_sna_2c_ll ulisttot; + t_sna_2c_ll zlist; - t_sna_2d blist; - t_sna_2c ulisttot; - t_sna_2c_lr ulisttot_lr; - t_sna_2c zlist; - - t_sna_3c ulist; - t_sna_2c_lr ylist; + t_sna_3c_ll ulist; + t_sna_2c_ll ylist; // derivatives of data - t_sna_4c dulist; + t_sna_4c_ll dulist; int idxcg_max, idxu_max, idxz_max, idxb_max; @@ -212,10 +247,7 @@ inline inline void init_rootpqarray(); // init() - KOKKOS_INLINE_FUNCTION - void zero_uarraytot(const int&); // compute_ui - KOKKOS_INLINE_FUNCTION - void addself_uarraytot(const int&, const double&); // compute_ui + KOKKOS_INLINE_FUNCTION void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, double, double, double); // compute_ui @@ -223,6 +255,12 @@ inline void compute_uarray(const typename Kokkos::TeamPolicy::member_type& team, int, int, double, double, double, double, double); // compute_ui + KOKKOS_INLINE_FUNCTION + void compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, + double, double, double, + double, double); // compute_ui_cpu + + inline double deltacg(int, int, int); // init_clebsch_gordan @@ -232,6 +270,10 @@ inline void compute_duarray(const typename Kokkos::TeamPolicy::member_type& team, int, int, double, double, double, // compute_duidrj double, double, double, double, double); + KOKKOS_INLINE_FUNCTION + void compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, + double, double, double, // compute_duidrj + double, double, double, double, double); // Sets the style for the switching function // 0 = none @@ -243,6 +285,9 @@ inline int bzero_flag; // 1 if bzero subtracted from barray Kokkos::View bzero; // array of B values for isolated atoms + + // for per-direction dulist calculation, specify the direction. + int dir; }; } diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 8d5a4a21be..c8c9a07e3e 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -224,16 +224,19 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) rcutij = t_sna_2d("sna:rcutij",natom,nmax); dedr = t_sna_3d("sna:dedr",natom,nmax,3); - blist = t_sna_2d("sna:blist",natom,idxb_max); - ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max); - if (!Kokkos::Impl::is_same::value) - ulisttot_lr = t_sna_2c_lr("sna:ulisttot_lr",natom,idxu_max); - zlist = t_sna_2c("sna:zlist",natom,idxz_max); + blist = t_sna_2d_ll("sna:blist",idxb_max,natom); + //ulisttot = t_sna_2c("sna:ulisttot",natom,idxu_max); + ulisttot = t_sna_2c_ll("sna:ulisttot",idxu_max,natom); - ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); - ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max); + zlist = t_sna_2c_ll("sna:zlist",idxz_max,natom); - dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max); + //ulist = t_sna_3c("sna:ulist",natom,nmax,idxu_max); + ulist = t_sna_3c_ll("sna:ulist",idxu_max,natom,nmax); + //ylist = t_sna_2c_lr("sna:ylist",natom,idxu_max); + ylist = t_sna_2c_ll("sna:ylist",idxu_max,natom); + + //dulist = t_sna_4c("sna:dulist",natom,nmax,idxu_max); + dulist = t_sna_4c_ll("sna:dulist",idxu_max,natom,nmax); } /* ---------------------------------------------------------------------- @@ -242,19 +245,31 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) template KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui(const int& iatom) +void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom) { - //if(team.team_rank() == 0) { - zero_uarraytot(iatom); - //Kokkos::single(Kokkos::PerThread(team), [&] (){ - addself_uarraytot(iatom,wself); - //}); - //} - //team.team_barrier(); + for (int j = 0; j <= twojmax; j++) { + const int jju = idxu_block(j); + + // Only diagonal elements get initialized + // for (int m = 0; m < (j+1)*(j+1); m++) + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j+1)), + [&] (const int m) { + + const int jjup = jju + m; + + // if m is on the "diagonal", initialize it with the self energy. + // Otherwise zero it out + SNAcomplex init = {0., 0.}; + if (m % (j+2) == 0) { init = {wself, 0.0}; } + + ulisttot(jjup, iatom) = init; + }); + } + } /* ---------------------------------------------------------------------- - compute Ui by summing over neighbors j + compute Ui by summing over bispectrum components ------------------------------------------------------------------------- */ template @@ -280,11 +295,72 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +{ + double rsq, r, x, y, z, z0, theta0; + + // utot(j,ma,mb) = 0 for all j,ma,ma + // utot(j,ma,ma) = 1 for all j,ma + // for j in neighbors of i: + // compute r0 = (x,y,z,z0) + // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb + + x = rij(iatom,jnbor,0); + y = rij(iatom,jnbor,1); + z = rij(iatom,jnbor,2); + rsq = x * x + y * y + z * z; + r = sqrt(rsq); + + theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); + // theta0 = (r - rmin0) * rscale0; + z0 = r / tan(theta0); + + compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor)); + +} + +/* ---------------------------------------------------------------------- + compute UiTot by summing over neighbors +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_uitot(const typename Kokkos::TeamPolicy::member_type& team, int idx, int iatom, int ninside) +{ + // fuse initialize in, avoid this load? + SNAcomplex utot = ulisttot(idx, iatom); + for (int jnbor = 0; jnbor < ninside; jnbor++) { + + const auto x = rij(iatom,jnbor,0); + const auto y = rij(iatom,jnbor,1); + const auto z = rij(iatom,jnbor,2); + const auto rsq = x * x + y * y + z * z; + const auto r = sqrt(rsq); + + const double wj_local = wj(iatom, jnbor); + const double rcut = rcutij(iatom, jnbor); + const double sfac = compute_sfac(r, rcut) * wj_local; + + auto ulist_local = ulist(idx, iatom, jnbor); + utot.re += sfac * ulist_local.re; + utot.im += sfac * ulist_local.im; + } + + ulisttot(idx, iatom) = utot; + } /* ---------------------------------------------------------------------- compute Zi by summing over products of Ui + not updated yet ------------------------------------------------------------------------- */ template @@ -306,8 +382,8 @@ void SNAKokkos::compute_zi(const int& iter) const double* cgblock = cglist.data() + idxcg_block(j1,j2,j); - zlist(iatom,jjz).re = 0.0; - zlist(iatom,jjz).im = 0.0; + zlist(jjz,iatom).re = 0.0; + zlist(jjz,iatom).im = 0.0; int jju1 = idxu_block[j1] + (j1+1)*mb1min; int jju2 = idxu_block[j2] + (j2+1)*mb2max; @@ -321,15 +397,15 @@ void SNAKokkos::compute_zi(const int& iter) int ma2 = ma2max; int icga = ma1min*(j2+1) + ma2max; for(int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).re - ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).im); - suma1_i += cgblock[icga] * (ulisttot(iatom,jju1+ma1).re * ulisttot(iatom,jju2+ma2).im + ulisttot(iatom,jju1+ma1).im * ulisttot(iatom,jju2+ma2).re); + suma1_r += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).re - ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).im); + suma1_i += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).im + ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).re); ma1++; ma2--; icga += j2; } // end loop over ia - zlist(iatom,jjz).re += cgblock[icgb] * suma1_r; - zlist(iatom,jjz).im += cgblock[icgb] * suma1_i; + zlist(jjz,iatom).re += cgblock[icgb] * suma1_r; + zlist(jjz,iatom).im += cgblock[icgb] * suma1_i; jju1 += j1+1; jju2 -= j2+1; @@ -343,10 +419,9 @@ void SNAKokkos::compute_zi(const int& iter) template KOKKOS_INLINE_FUNCTION -void SNAKokkos::zero_yi(const int& iatom) +void SNAKokkos::zero_yi(const int& idx, const int& iatom) { - for (int j = 0; j < idxu_max; j++) - ylist(iatom,j) = {0.0,0.0}; + ylist(idx,iatom) = {0.0, 0.0}; } /* ---------------------------------------------------------------------- @@ -393,8 +468,8 @@ void SNAKokkos::compute_yi(int iter, int icga = ma1min*(j2+1) + ma2max; for(int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot_lr(iatom,jju1+ma1).re * ulisttot_lr(iatom,jju2+ma2).re - ulisttot_lr(iatom,jju1+ma1).im * ulisttot_lr(iatom,jju2+ma2).im); - suma1_i += cgblock[icga] * (ulisttot_lr(iatom,jju1+ma1).re * ulisttot_lr(iatom,jju2+ma2).im + ulisttot_lr(iatom,jju1+ma1).im * ulisttot_lr(iatom,jju2+ma2).re); + suma1_r += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).re - ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).im); + suma1_i += cgblock[icga] * (ulisttot(jju1+ma1,iatom).re * ulisttot(jju2+ma2,iatom).im + ulisttot(jju1+ma1,iatom).im * ulisttot(jju2+ma2,iatom).re); ma1++; ma2--; icga += j2; @@ -417,20 +492,20 @@ void SNAKokkos::compute_yi(int iter, if (j >= j1) { const int jjb = idxb_block(j1,j2,j); if (j1 == j) { - if (j2 == j) betaj = 3*beta(iatom,jjb); - else betaj = 2*beta(iatom,jjb); - } else betaj = beta(iatom,jjb); + if (j2 == j) betaj = 3*beta(jjb,iatom); + else betaj = 2*beta(jjb,iatom); + } else betaj = beta(jjb,iatom); } else if (j >= j2) { const int jjb = idxb_block(j,j2,j1); - if (j2 == j) betaj = 2*beta(iatom,jjb)*(j1+1)/(j+1.0); - else betaj = beta(iatom,jjb)*(j1+1)/(j+1.0); + if (j2 == j) betaj = 2*beta(jjb,iatom)*(j1+1)/(j+1.0); + else betaj = beta(jjb,iatom)*(j1+1)/(j+1.0); } else { const int jjb = idxb_block(j2,j,j1); - betaj = beta(iatom,jjb)*(j1+1)/(j+1.0); + betaj = beta(jjb,iatom)*(j1+1)/(j+1.0); } - Kokkos::atomic_add(&(ylist(iatom,jju).re), betaj*ztmp_r); - Kokkos::atomic_add(&(ylist(iatom,jju).im), betaj*ztmp_i); + Kokkos::atomic_add(&(ylist(jju,iatom).re), betaj*ztmp_r); + Kokkos::atomic_add(&(ylist(jju,iatom).im), betaj*ztmp_i); } /* ---------------------------------------------------------------------- @@ -441,18 +516,81 @@ template KOKKOS_INLINE_FUNCTION void SNAKokkos::compute_deidrj(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) { - t_scalar3 sum; + t_scalar3 final_sum; + // Like in ComputeUi/ComputeDuidrj, regular loop over j. + for (int j = 0; j <= twojmax; j++) { + int jju = idxu_block(j); + + // Flatten loop over ma, mb, reduce w/in + + const int n_ma = j+1; + // for (int mb = 0; 2*mb <= j; mb++) + const int n_mb = j/2+1; + + const int total_iters = n_ma * n_mb; + + t_scalar3 sum; + + //for (int m = 0; m < total_iters; m++) { + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters), + [&] (const int m, t_scalar3& sum_tmp) { + + // ma fast, mb slow + int ma = m % n_ma; + int mb = m / n_ma; + + // get index + const int jju_index = jju+mb+mb*j+ma; + + // get ylist, rescale last element by 0.5 + SNAcomplex y_local = ylist(jju_index,iatom); + + const SNAcomplex du_x = dulist(jju_index,iatom,jnbor,0); + const SNAcomplex du_y = dulist(jju_index,iatom,jnbor,1); + const SNAcomplex du_z = dulist(jju_index,iatom,jnbor,2); + + if (j % 2 == 0 && 2*mb == j) { + if (ma == mb) { y_local = 0.5*y_local; } + else if (ma > mb) { y_local = { 0., 0. }; } + // else the ma < mb gets "double counted", cancelling the 0.5. + } + + sum_tmp.x += du_x.re * y_local.re + du_x.im * y_local.im; + sum_tmp.y += du_y.re * y_local.re + du_y.im * y_local.im; + sum_tmp.z += du_z.re * y_local.re + du_z.im * y_local.im; + + }, sum); // end loop over flattened ma,mb + + final_sum.x += sum.x; + final_sum.y += sum.y; + final_sum.z += sum.z; + } + + Kokkos::single(Kokkos::PerThread(team), [&] () { + dedr(iatom,jnbor,0) = final_sum.x*2.0; + dedr(iatom,jnbor,1) = final_sum.y*2.0; + dedr(iatom,jnbor,2) = final_sum.z*2.0; + }); + +} + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +{ + t_scalar3 final_sum; + + //for(int j = 0; j <= twojmax; j++) { Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), [&] (const int& j, t_scalar3& sum_tmp) { - //for(int j = 0; j <= twojmax; j++) { int jju = idxu_block[j]; for(int mb = 0; 2*mb < j; mb++) for(int ma = 0; ma <= j; ma++) { - sum_tmp.x += dulist(iatom,jnbor,jju,0).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,0).im * ylist(iatom,jju).im; - sum_tmp.y += dulist(iatom,jnbor,jju,1).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,1).im * ylist(iatom,jju).im; - sum_tmp.z += dulist(iatom,jnbor,jju,2).re * ylist(iatom,jju).re + dulist(iatom,jnbor,jju,2).im * ylist(iatom,jju).im; + sum_tmp.x += dulist(jju,iatom,jnbor,0).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,0).im * ylist(jju,iatom).im; + sum_tmp.y += dulist(jju,iatom,jnbor,1).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,1).im * ylist(jju,iatom).im; + sum_tmp.z += dulist(jju,iatom,jnbor,2).re * ylist(jju,iatom).re + dulist(jju,iatom,jnbor,2).im * ylist(jju,iatom).im; jju++; } //end loop over ma mb @@ -462,25 +600,24 @@ void SNAKokkos::compute_deidrj(const typename Kokkos::TeamPolicy::compute_bi(const typename Kokkos::TeamPolicy::compute_bi(const typename Kokkos::TeamPolicy::compute_bi(const typename Kokkos::TeamPolicy::compute_bi(const typename Kokkos::TeamPolicy::compute_duidrj(const typename Kokkos::TeamPolicy KOKKOS_INLINE_FUNCTION -void SNAKokkos::zero_uarraytot(const int& iatom) +void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) { - { - //Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,ulisttot.extent(1)), - // [&] (const int& i) { - for (int i = 0; i < ulisttot.extent(1); i++) { - ulisttot(iatom,i).re = 0.0; - ulisttot(iatom,i).im = 0.0; - } - //}); - } -} + double rsq, r, x, y, z, z0, theta0, cs, sn; + double dz0dr; -/* ---------------------------------------------------------------------- */ + x = rij(iatom,jnbor,0); + y = rij(iatom,jnbor,1); + z = rij(iatom,jnbor,2); + rsq = x * x + y * y + z * z; + r = sqrt(rsq); + double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); + theta0 = (r - rmin0) * rscale0; + cs = cos(theta0); + sn = sin(theta0); + z0 = r * cs / sn; + dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::addself_uarraytot(const int& iatom, const double& wself_in) -{ - //Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), - // [&] (const int& j) { - for (int j = 0; j <= twojmax; j++) { - int jju = idxu_block[j]; - for (int ma = 0; ma <= j; ma++) { - ulisttot(iatom,jju).re = wself_in; - ulisttot(iatom,jju).im = 0.0; - jju += j+2; - } - }//}); + compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor)); } /* ---------------------------------------------------------------------- @@ -639,10 +763,10 @@ void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::compute_uarray(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, double x, double y, double z, double z0, double r) +{ + // define size of scratch memory buffer + const int max_m_tile = (twojmax+1)*(twojmax+1); + const int team_rank = team.team_rank(); + + // get scratch memory double buffer + SNAcomplex* buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); + SNAcomplex* buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0); + + // compute Cayley-Klein parameters for unit quaternion, + // pack into complex number + double r0inv = 1.0 / sqrt(r * r + z0 * z0); + SNAcomplex a = { r0inv * z0, -r0inv * z }; + SNAcomplex b = { r0inv * y, -r0inv * x }; + + // VMK Section 4.8.2 + + // All writes go to global memory and shared memory + // so we can avoid all global memory reads + Kokkos::single(Kokkos::PerThread(team), [=]() { + ulist(0,iatom,jnbor) = { 1.0, 0.0 }; + buf1[max_m_tile*team_rank] = {1.,0.}; + }); + + for (int j = 1; j <= twojmax; j++) { + const int jju = idxu_block[j]; + int jjup = idxu_block[j-1]; + + // fill in left side of matrix layer from previous layer + + // Flatten loop over ma, mb, need to figure out total + // number of iterations + + // for (int ma = 0; ma <= j; ma++) + const int n_ma = j+1; + // for (int mb = 0; 2*mb <= j; mb++) + const int n_mb = j/2+1; + + const int total_iters = n_ma * n_mb; + + //for (int m = 0; m < total_iters; m++) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters), + [&] (const int m) { + + // ma fast, mb slow + int ma = m % n_ma; + int mb = m / n_ma; + + // index into global memory array + const int jju_index = jju+mb+mb*j+ma; + + // index into shared memory buffer for previous level + const int jju_shared_idx = max_m_tile*team_rank+mb+mb*j+ma; + + // index into shared memory buffer for next level + const int jjup_shared_idx = max_m_tile*team_rank+mb*j+ma; + + SNAcomplex u_accum = {0., 0.}; + + // VMK recursion relation: grab contribution which is multiplied by a* + const double rootpq1 = rootpqarray(j - ma, j - mb); + const SNAcomplex u_up1 = (ma < j)?rootpq1*buf1[jjup_shared_idx]:SNAcomplex(0.,0.); + caconjxpy(a, u_up1, u_accum); + + // VMK recursion relation: grab contribution which is multiplied by b* + const double rootpq2 = -rootpqarray(ma, j - mb); + const SNAcomplex u_up2 = (ma > 0)?rootpq2*buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); + caconjxpy(b, u_up2, u_accum); + + ulist(jju_index,iatom,jnbor) = u_accum; + + // We no longer accumulate into ulisttot in this kernel. + // Instead, we have a separate kernel which avoids atomics. + // Running two separate kernels is net faster. + + // back up into shared memory for next iter + if (j != twojmax) buf2[jju_shared_idx] = u_accum; + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) + // We can avoid this if we're on the last row for an integer j + if (!(n_ma % 2 == 1 && (mb+1) == n_mb)) { + + int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1); + const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); + const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1); + + if (sign_factor == 1) { + u_accum.im = -u_accum.im; + } else { + u_accum.re = -u_accum.re; + } + ulist(jjup_flip,iatom,jnbor) = u_accum; + if (j != twojmax) buf2[jju_shared_flip] = u_accum; + } + }); + // In CUDA backend, + // ThreadVectorRange has a __syncwarp (appropriately masked for + // vector lengths < 32) implict at the end + + // swap double buffers + auto tmp = buf1; buf1 = buf2; buf2 = tmp; + //std::swap(buf1, buf2); // throws warnings + + } +} + +// CPU version +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, + double x, double y, double z, + double z0, double r) { double r0inv; double a_r, b_r, a_i, b_i; @@ -670,8 +907,8 @@ void SNAKokkos::compute_uarray(const typename Kokkos::TeamPolicy::compute_uarray(const typename Kokkos::TeamPolicy::compute_uarray(const typename Kokkos::TeamPolicy::compute_uarray(const typename Kokkos::TeamPolicy -void SNAKokkos::transpose_ulisttot() -{ - UlisttotHelper::transpose(ulisttot_lr,ulisttot); -} - /* ---------------------------------------------------------------------- compute derivatives of Wigner U-functions for one neighbor see comments in compute_uarray() @@ -755,7 +986,150 @@ void SNAKokkos::compute_duarray(const typename Kokkos::TeamPolicy 0)?rootpq2*ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.); + caconjxpy(b, u_up2, u_accum); + + // No need to save u_accum to global memory + if (j != twojmax) ulist_buf2[jju_shared_idx] = u_accum; + + // Next, spin up du_accum + const SNAcomplex du_up1 = (ma < j) ? rootpq1*dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.); + caconjxpy(da, u_up1, du_accum); + caconjxpy(a, du_up1, du_accum); + + const SNAcomplex du_up2 = (ma > 0) ? rootpq2*dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.); + caconjxpy(db, u_up2, du_accum); + caconjxpy(b, du_up2, du_accum); + + dulist(jju_index,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); + + if (j != twojmax) dulist_buf2[jju_shared_idx] = du_accum; + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) + + int sign_factor = ((ma%2==0)?1:-1)*(mb%2==0?1:-1); + const int jjup_flip = jju+(j+1-mb)*(j+1)-(ma+1); + const int jju_shared_flip = max_m_tile*team_rank+(j+1-mb)*(j+1)-(ma+1); + + if (sign_factor == 1) { + //ulist_alt(iatom,jnbor,jjup_flip).re = u_accum.re; + //ulist_alt(iatom,jnbor,jjup_flip).im = -u_accum.im; + u_accum.im = -u_accum.im; + du_accum.im = -du_accum.im; + } else { + //ulist_alt(iatom,jnbor,jjup_flip).re = -u_accum.re; + //ulist_alt(iatom,jnbor,jjup_flip).im = u_accum.im; + u_accum.re = -u_accum.re; + du_accum.re = -du_accum.re; + } + + dulist(jjup_flip,iatom,jnbor,dir) = ((dsfac * u)*u_accum) + (sfac*du_accum); + + if (j != twojmax) { + ulist_buf2[jju_shared_flip] = u_accum; + dulist_buf2[jju_shared_flip] = du_accum; + } + + }); + + // swap buffers + auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp; + tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp; + } +} + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, + double x, double y, double z, + double z0, double r, double dz0dr, + double wj, double rcut) +{ +double r0inv; double a_r, a_i, b_r, b_i; double da_r[3], da_i[3], db_r[3], db_i[3]; double dz0[3], dr0inv[3], dr0invdr; @@ -772,7 +1146,7 @@ void SNAKokkos::compute_duarray(const typename Kokkos::TeamPolicy::compute_duarray(const typename Kokkos::TeamPolicy::compute_duarray(const typename Kokkos::TeamPolicy::compute_duarray(const typename Kokkos::TeamPolicy::compute_duarray(const typename Kokkos::TeamPolicy::compute_dsfac(double r, double rcut) return 0.0; } +/* ---------------------------------------------------------------------- */ + +// efficient complex FMA (i.e., y += a x) +template +KOKKOS_FORCEINLINE_FUNCTION +void SNAKokkos::caxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) { + y.re += a.re * x.re; + y.re -= a.im * x.im; + y.im += a.im * x.re; + y.im += a.re * x.im; +} + +/* ---------------------------------------------------------------------- */ + +// efficient complex FMA, conjugate of scalar (i.e.) y += (a.re - i a.im) x) +template +KOKKOS_FORCEINLINE_FUNCTION +void SNAKokkos::caconjxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) { + y.re += a.re * x.re; + y.re += a.im * x.im; + y.im -= a.im * x.re; + y.im += a.re * x.im; +} + +/* ---------------------------------------------------------------------- */ + +// set direction of batched Duidrj +template +KOKKOS_FORCEINLINE_FUNCTION +void SNAKokkos::set_dir(int dir_) { + dir = dir_; +} + /* ---------------------------------------------------------------------- memory usage of arrays ------------------------------------------------------------------------- */ diff --git a/src/min_fire.cpp b/src/min_fire.cpp index 4aeeddf6b2..0b0a9d801b 100644 --- a/src/min_fire.cpp +++ b/src/min_fire.cpp @@ -87,6 +87,7 @@ void MinFire::setup_style() for (int i = 0; i < nlocal; i++) v[i][0] = v[i][1] = v[i][2] = 0.0; + flagv0 = 1; } /* ---------------------------------------------------------------------- @@ -259,14 +260,40 @@ int MinFire::iterate(int maxiter) if (halfstepback_flag) { for (int i = 0; i < nlocal; i++) { - x[i][0] -= 0.5 * dtv * v[i][0]; - x[i][1] -= 0.5 * dtv * v[i][1]; - x[i][2] -= 0.5 * dtv * v[i][2]; + x[i][0] -= 0.5 * dt * v[i][0]; + x[i][1] -= 0.5 * dt * v[i][1]; + x[i][2] -= 0.5 * dt * v[i][2]; } } for (int i = 0; i < nlocal; i++) v[i][0] = v[i][1] = v[i][2] = 0.0; + flagv0 = 1; + } + + // evaluates velocties to estimate wether dtv has to be limited + // required when v have been reset + + if (flagv0) { + dtf = dt * force->ftm2v; + energy_force(0); + neval++; + + if (rmass) { + for (int i = 0; i < nlocal; i++) { + dtfm = dtf / rmass[i]; + v[i][0] = dtfm * f[i][0]; + v[i][1] = dtfm * f[i][1]; + v[i][2] = dtfm * f[i][2]; + } + } else { + for (int i = 0; i < nlocal; i++) { + dtfm = dtf / mass[type[i]]; + v[i][0] = dtfm * f[i][0]; + v[i][1] = dtfm * f[i][1]; + v[i][2] = dtfm * f[i][2]; + } + } } // limit timestep so no particle moves further than dmax @@ -281,6 +308,13 @@ int MinFire::iterate(int maxiter) MPI_Allreduce(&dtvone,&dtv,1,MPI_DOUBLE,MPI_MIN,world); + // reset velocities when necessary + + if (flagv0) { + for (int i = 0; i < nlocal; i++) + v[i][0] = v[i][1] = v[i][2] = 0.0; + } + // min dtv over replicas, if necessary // this communicator would be invalid for multiprocess replicas @@ -438,6 +472,10 @@ int MinFire::iterate(int maxiter) neval++; } + // velocities have been evaluated + + flagv0 = 0; + // energy tolerance criterion // only check after delaystep elapsed since velocties reset to 0 // sync across replicas if running multi-replica minimization diff --git a/src/min_fire.h b/src/min_fire.h index 0c4be968dc..79e5cb7c14 100644 --- a/src/min_fire.h +++ b/src/min_fire.h @@ -37,7 +37,7 @@ class MinFire : public Min { double dt,dtmax,dtmin; double alpha; bigint last_negative,ntimestep_start; - int vdotf_negatif; + int vdotf_negatif,flagv0; }; }