diff --git a/doc/src/compute_orientorder_atom.rst b/doc/src/compute_orientorder_atom.rst index 2a36fb4cc0..af4e6a21e8 100644 --- a/doc/src/compute_orientorder_atom.rst +++ b/doc/src/compute_orientorder_atom.rst @@ -115,8 +115,8 @@ The optional keyword *chunksize* is only applicable when using the the KOKKOS package and is ignored otherwise. This keyword controls the number of atoms in each pass used to compute the bond-orientational order parameters and is used to avoid running out of memory. For example -if there are 4000 atoms in the simulation and the *chunksize* -is set to 2000, the parameter calculation will be broken up +if there are 32768 atoms in the simulation and the *chunksize* +is set to 16384, the parameter calculation will be broken up into two passes. The value of :math:`Q_l` is set to zero for atoms not in the @@ -193,7 +193,7 @@ Default The option defaults are *cutoff* = pair style cutoff, *nnn* = 12, *degrees* = 5 4 6 8 10 12 i.e. :math:`Q_4`, :math:`Q_6`, :math:`Q_8`, :math:`Q_{10}`, and :math:`Q_{12}`, -*wl* = no, *wl/hat* = no, *components* off, and *chunksize* = 2000 +*wl* = no, *wl/hat* = no, *components* off, and *chunksize* = 16384 ---------- diff --git a/doc/src/pair_snap.rst b/doc/src/pair_snap.rst index 8aa10f5986..a99331345e 100644 --- a/doc/src/pair_snap.rst +++ b/doc/src/pair_snap.rst @@ -152,7 +152,7 @@ The default values for these keywords are * *chemflag* = 0 * *bnormflag* = 0 * *wselfallflag* = 0 -* *chunksize* = 2000 +* *chunksize* = 4096 If *quadraticflag* is set to 1, then the SNAP energy expression includes additional quadratic terms that have been shown to increase the overall accuracy of the potential without much increase @@ -189,8 +189,8 @@ pair style *snap* with the KOKKOS package and is ignored otherwise. This keyword controls the number of atoms in each pass used to compute the bispectrum components and is used to avoid running out of memory. For example -if there are 4000 atoms in the simulation and the *chunksize* -is set to 2000, the bispectrum calculation will be broken up +if there are 8192 atoms in the simulation and the *chunksize* +is set to 4096, the bispectrum calculation will be broken up into two passes. Detailed definitions for all the other keywords diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index 72513d2b17..fbe9799bee 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -1076,20 +1076,34 @@ struct params_lj_coul { // Pair SNAP +#define SNAP_KOKKOS_REAL double +#define SNAP_KOKKOS_HOST_VECLEN 1 + +#ifdef LMP_KOKKOS_GPU +#define SNAP_KOKKOS_DEVICE_VECLEN 32 +#else +#define SNAP_KOKKOS_DEVICE_VECLEN 1 +#endif + + +// intentional: SNAreal/complex gets reused beyond SNAP typedef double SNAreal; //typedef struct { SNAreal re, im; } SNAcomplex; -template -struct alignas(2*sizeof(real)) SNAComplex +template +struct alignas(2*sizeof(real_type_)) SNAComplex { - real re,im; + using real_type = real_type_; + using complex = SNAComplex; + real_type re,im; - SNAComplex() = default; + KOKKOS_FORCEINLINE_FUNCTION SNAComplex() + : re(static_cast(0.)), im(static_cast(0.)) { ; } - KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re) - : re(re), im(static_cast(0.)) { ; } + KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real_type re) + : re(re), im(static_cast(0.)) { ; } - KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real re, real im) + KOKKOS_FORCEINLINE_FUNCTION SNAComplex(real_type re, real_type im) : re(re), im(im) { ; } KOKKOS_FORCEINLINE_FUNCTION SNAComplex(const SNAComplex& other) @@ -1117,27 +1131,24 @@ struct alignas(2*sizeof(real)) SNAComplex return *this; } + KOKKOS_INLINE_FUNCTION + static constexpr complex zero() { return complex(static_cast(0.), static_cast(0.)); } + + KOKKOS_INLINE_FUNCTION + static constexpr complex one() { return complex(static_cast(1.), static_cast(0.)); } + + KOKKOS_INLINE_FUNCTION + const complex conj() { return complex(re, -im); } + }; -template -KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator*(const real& r, const SNAComplex& self) { - return SNAComplex(r*self.re, r*self.im); +template +KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator*(const real_type& r, const SNAComplex& self) { + return SNAComplex(r*self.re, r*self.im); } typedef SNAComplex SNAcomplex; -// Cayley-Klein pack -// Can guarantee it's aligned to 2 complex -struct alignas(32) CayleyKleinPack { - - SNAcomplex a, b; - SNAcomplex da[3], db[3]; - SNAreal sfac; - SNAreal dsfacu[3]; - -}; - - #if defined(KOKKOS_ENABLE_CXX11) #undef ISFINITE #define ISFINITE(x) std::isfinite(x) diff --git a/src/KOKKOS/pair_snap_kokkos.cpp b/src/KOKKOS/pair_snap_kokkos.cpp index 6f324a97be..4677d178e0 100644 --- a/src/KOKKOS/pair_snap_kokkos.cpp +++ b/src/KOKKOS/pair_snap_kokkos.cpp @@ -15,9 +15,11 @@ #include "pair_snap_kokkos_impl.h" namespace LAMMPS_NS { -template class PairSNAPKokkos; + +template class PairSNAPKokkosDevice; #ifdef LMP_KOKKOS_GPU -template class PairSNAPKokkos; +template class PairSNAPKokkosHost; #endif + } diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index e7a1f3173d..416cc1b888 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -13,9 +13,13 @@ #ifdef PAIR_CLASS -PairStyle(snap/kk,PairSNAPKokkos) -PairStyle(snap/kk/device,PairSNAPKokkos) -PairStyle(snap/kk/host,PairSNAPKokkos) +PairStyle(snap/kk,PairSNAPKokkosDevice) +PairStyle(snap/kk/device,PairSNAPKokkosDevice) +#ifdef LMP_KOKKOS_GPU +PairStyle(snap/kk/host,PairSNAPKokkosHost) +#else +PairStyle(snap/kk/host,PairSNAPKokkosDevice) +#endif #else @@ -33,9 +37,11 @@ namespace LAMMPS_NS { // Routines for both the CPU and GPU backend template struct TagPairSNAPComputeForce{}; -struct TagPairSNAPComputeNeigh{}; + // GPU backend only +struct TagPairSNAPComputeNeigh{}; +struct TagPairSNAPComputeCayleyKlein{}; struct TagPairSNAPPreUi{}; struct TagPairSNAPComputeUi{}; struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist @@ -44,10 +50,10 @@ struct TagPairSNAPBeta{}; struct TagPairSNAPComputeBi{}; struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS struct TagPairSNAPComputeYi{}; -struct TagPairSNAPTransformYi{}; // re-order ylist from AoSoA to AoS struct TagPairSNAPComputeFusedDeidrj{}; // CPU backend only +struct TagPairSNAPComputeNeighCPU{}; struct TagPairSNAPPreUiCPU{}; struct TagPairSNAPComputeUiCPU{}; struct TagPairSNAPTransformUiCPU{}; @@ -59,7 +65,7 @@ struct TagPairSNAPComputeYiCPU{}; struct TagPairSNAPComputeDuidrjCPU{}; struct TagPairSNAPComputeDeidrjCPU{}; -template +template class PairSNAPKokkos : public PairSNAP { public: enum {EnabledNeighFlags=FULL|HALF|HALFTHREAD}; @@ -68,6 +74,14 @@ public: typedef ArrayTypes AT; typedef EV_FLOAT value_type; + static constexpr int vector_length = vector_length_; + using real_type = real_type_; + using complex = SNAComplex; + + // type-dependent team sizes + static constexpr int team_size_compute_ui = sizeof(real_type) == 4 ? 8 : 4; + static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2; + PairSNAPKokkos(class LAMMPS *); ~PairSNAPKokkos(); @@ -78,10 +92,10 @@ public: double memory_usage(); template - void check_team_size_for(int, int&, int); + void check_team_size_for(int, int&); template - void check_team_size_reduce(int, int&, int); + void check_team_size_reduce(int, int&); template KOKKOS_INLINE_FUNCTION @@ -91,15 +105,18 @@ public: KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeForce,const typename Kokkos::TeamPolicy >::member_type& team, EV_FLOAT&) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; - KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPBetaCPU,const int& ii) const; // GPU backend only KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPPreUi,const int iatom_mod, const int j, const int iatom_div) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -122,13 +139,13 @@ public: KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu, const int iatom_div) const; - KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const; // CPU backend only + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -173,7 +190,7 @@ protected: t_bvec bvec; typedef Kokkos::View t_dbvec; t_dbvec dbvec; - SNAKokkos snaKK; + SNAKokkos snaKK; int inum,max_neighs,chunk_size,chunk_offset; int host_flag; @@ -208,14 +225,14 @@ inline double dist2(double* x,double* y); Kokkos::View i_uarraytot_r, i_uarraytot_i; Kokkos::View i_zarray_r, i_zarray_i; - Kokkos::View d_radelem; // element radii - Kokkos::View d_wjelem; // elements weights - Kokkos::View d_coeffelem; // element bispectrum coefficients + Kokkos::View d_radelem; // element radii + Kokkos::View d_wjelem; // elements weights + Kokkos::View d_coeffelem; // element bispectrum coefficients Kokkos::View d_map; // mapping from atom types to elements Kokkos::View d_ninside; // ninside for all atoms in list - Kokkos::View d_beta; // betas for all atoms in list - Kokkos::View d_beta_pack; // betas for all atoms in list, GPU - Kokkos::View d_bispectrum; // bispectrum components for all atoms in list + Kokkos::View d_beta; // betas for all atoms in list + Kokkos::View d_beta_pack; // betas for all atoms in list, GPU + Kokkos::View d_bispectrum; // bispectrum components for all atoms in list typedef Kokkos::DualView tdual_fparams; tdual_fparams k_cutsq; @@ -237,6 +254,49 @@ inline double dist2(double* x,double* y); }; + +// These wrapper classes exist to make the pair style factory happy/avoid having +// to extend the pair style factory to support Pair classes w/an arbitrary number +// of extra template parameters + +template +class PairSNAPKokkosDevice : public PairSNAPKokkos { + +private: + using Base = PairSNAPKokkos; + +public: + + PairSNAPKokkosDevice(class LAMMPS *); + + void coeff(int, char**); + void init_style(); + double init_one(int, int); + void compute(int, int); + double memory_usage(); + +}; + +#ifdef LMP_KOKKOS_GPU +template +class PairSNAPKokkosHost : public PairSNAPKokkos { + +private: + using Base = PairSNAPKokkos; + +public: + + PairSNAPKokkosHost(class LAMMPS *); + + void coeff(int, char**); + void init_style(); + double init_one(int, int); + void compute(int, int); + double memory_usage(); + +}; +#endif + } #endif diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index bacb5f75d6..1349232136 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -48,8 +48,8 @@ namespace LAMMPS_NS { //static double t7 = 0.0; /* ---------------------------------------------------------------------- */ -template -PairSNAPKokkos::PairSNAPKokkos(LAMMPS *lmp) : PairSNAP(lmp) +template +PairSNAPKokkos::PairSNAPKokkos(LAMMPS *lmp) : PairSNAP(lmp) { respa_enable = 0; @@ -67,8 +67,8 @@ PairSNAPKokkos::PairSNAPKokkos(LAMMPS *lmp) : PairSNAP(lmp) /* ---------------------------------------------------------------------- */ -template -PairSNAPKokkos::~PairSNAPKokkos() +template +PairSNAPKokkos::~PairSNAPKokkos() { if (copymode) return; @@ -81,8 +81,8 @@ PairSNAPKokkos::~PairSNAPKokkos() init specific to this pair style ------------------------------------------------------------------------- */ -template -void PairSNAPKokkos::init_style() +template +void PairSNAPKokkos::init_style() { if (force->newton_pair == 0) error->all(FLERR,"Pair style SNAP requires newton pair on"); @@ -128,8 +128,8 @@ struct FindMaxNumNeighs { This version is a straightforward implementation ---------------------------------------------------------------------- */ -template -void PairSNAPKokkos::compute(int eflag_in, int vflag_in) +template +void PairSNAPKokkos::compute(int eflag_in, int vflag_in) { eflag = eflag_in; vflag = vflag_in; @@ -186,16 +186,15 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) max_neighs = 0; Kokkos::parallel_reduce("PairSNAPKokkos::find_max_neighs",inum, FindMaxNumNeighs(k_list), Kokkos::Max(max_neighs)); - int vector_length_default = 1; int team_size_default = 1; if (!host_flag) team_size_default = 32;//max_neighs; if (beta_max < inum) { beta_max = inum; - d_beta = Kokkos::View("PairSNAPKokkos:beta",ncoeff,inum); + d_beta = Kokkos::View("PairSNAPKokkos:beta",ncoeff,inum); if (!host_flag) - d_beta_pack = Kokkos::View("PairSNAPKokkos:beta_pack",32,ncoeff,(inum+32-1)/32); + d_beta_pack = Kokkos::View("PairSNAPKokkos:beta_pack",vector_length,ncoeff,(inum + vector_length - 1) / vector_length); d_ninside = Kokkos::View("PairSNAPKokkos:ninside",inum); } @@ -213,31 +212,28 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) if (chunk_size > inum - chunk_offset) chunk_size = inum - chunk_offset; - //ComputeNeigh - { - int vector_length = vector_length_default; - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size,vector_length); - typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); - Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); - } - if (host_flag) { // Host codepath + //ComputeNeigh + { + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size); + typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); + Kokkos::parallel_for("ComputeNeighCPU",policy_neigh,*this); + } + //PreUi { - int vector_length = vector_length_default; int team_size = team_size_default; - check_team_size_for(chunk_size,team_size,vector_length); + check_team_size_for(chunk_size,team_size); typename Kokkos::TeamPolicy policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length); Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this); } // ComputeUi { - int vector_length = vector_length_default; int team_size = team_size_default; // Fused calculation of ulist and accumulation into ulisttot using atomics typename Kokkos::TeamPolicy policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); @@ -259,9 +255,8 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this); //ComputeBi - int vector_length = vector_length_default; int team_size = team_size_default; - check_team_size_for(chunk_size,team_size,vector_length); + check_team_size_for(chunk_size,team_size); typename Kokkos::TeamPolicy policy_bi_cpu(chunk_size,team_size,vector_length); Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); } @@ -281,7 +276,6 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) //ComputeDuidrj and Deidrj { int team_size = team_size_default; - int vector_length = vector_length_default; typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); snaKK.set_dir(-1); // technically doesn't do anything @@ -295,39 +289,67 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) } else { // GPU #ifdef LMP_KOKKOS_GPU + + //ComputeNeigh + { + constexpr int team_size = 4; + + // scratch size: max_neighs * sizeof(int) * number of threads per team + typedef Kokkos::View< int*, + Kokkos::DefaultExecutionSpace::scratch_memory_space, + Kokkos::MemoryTraits > + ScratchViewType; + int scratch_size = ScratchViewType::shmem_size(team_size * max_neighs); + + typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + } + + //ComputeCayleyKlein + { + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeCayleyKlein> + policy_compute_ck({0,0,0},{vector_length,max_neighs,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1}); + Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this); + } + //PreUi { - int vector_length = vector_length_default; - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size,vector_length); - typename Kokkos::TeamPolicy policy_preui((chunk_size+team_size-1)/team_size,team_size,vector_length); + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPPreUi> + policy_preui({0,0,0},{vector_length,twojmax+1,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1}); Kokkos::parallel_for("PreUi",policy_preui,*this); + } // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot { + // new AoSoA form - int vector_length = 32; - int team_size = 4; // need to cap b/c of shared memory reqs - check_team_size_for(chunk_size,team_size,vector_length); + // team_size_compute_ui is defined in `pair_snap_kokkos.h` + constexpr int team_size = team_size_compute_ui; - // scratch size: 2 * team_size * (twojmax+1)^2, to cover all `m1`,`m2` values, div 2 for symmetry - // 2 is for double buffer - - const int tile_size = (twojmax+1)*(twojmax/2+1); - typedef Kokkos::View< SNAcomplex*, + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + typedef Kokkos::View< complex*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits > ScratchViewType; - int scratch_size = ScratchViewType::shmem_size( 2 * team_size * tile_size ); + int scratch_size = ScratchViewType::shmem_size(team_size * tile_size); - typename Kokkos::TeamPolicy policy_ui(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam( scratch_size )); + // total number of teams needed + int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + + // (natoms / 32) * (max_neighs) * ("bend" locations) + int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + int n_teams_div = (n_teams + team_size - 1) / team_size; + + typename Kokkos::TeamPolicy policy_ui(n_teams_div, team_size, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); Kokkos::parallel_for("ComputeUi",policy_ui,*this); - //Transform data layout of ulisttot to AoSoA, zero ylist - typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{32,twojmax+1,(chunk_size + 32 - 1) / 32},{32,4,1}); + // un-"fold" ulisttot, zero ylist + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUi> policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1}); Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); } @@ -336,19 +358,19 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) if (quadraticflag || eflag) { //ComputeZi int idxz_max = snaKK.idxz_max; - typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeZi> policy_compute_zi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeZi> policy_compute_zi({0,0,0},{vector_length,idxz_max,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1}); Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); //ComputeBi int idxb_max = snaKK.idxb_max; - typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeBi> policy_compute_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeBi> policy_compute_bi({0,0,0},{vector_length,idxb_max,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1}); Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); //Transform data layout of blist out of AoSoA //We need this b/c `blist` gets used in ComputeForce which doesn't //take advantage of AoSoA (which at best would only be beneficial //on the margins) - typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformBi> policy_transform_bi({0,0,0},{32,idxb_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformBi> policy_transform_bi({0,0,0},{vector_length,idxb_max,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1}); Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); } @@ -361,39 +383,41 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) //ComputeYi const int idxz_max = snaKK.idxz_max; - typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeYi> policy_compute_yi({0,0,0},{32,idxz_max,(chunk_size + 32 - 1) / 32},{32,4,1}); + typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPComputeYi> policy_compute_yi({0,0,0},{vector_length,idxz_max,(chunk_size + vector_length - 1) / vector_length},{vector_length,4,1}); Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this); - //Transform data layout of ylist out of AoSoA - const int idxu_half_max = snaKK.idxu_half_max; - typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformYi> policy_transform_yi({0,0,0},{32,idxu_half_max,(chunk_size + 32 - 1) / 32},{32,4,1}); - Kokkos::parallel_for("TransformYi",policy_transform_yi,*this); - } // Fused ComputeDuidrj, ComputeDeidrj { - int vector_length = 32; - int team_size = 2; // need to cap b/c of shared memory reqs - check_team_size_for(chunk_size,team_size,vector_length); + // new AoSoA form - // scratch size: 2 * 2 * team_size * (twojmax+1)*(twojmax/2+1), to cover half `m1`,`m2` values due to symmetry - // 2 is for double buffer - const int tile_size = (twojmax+1)*(twojmax/2+1); + // team_size_compute_fused_deidrj is defined in `pair_snap_kokkos.h` + constexpr int team_size = team_size_compute_fused_deidrj; - typedef Kokkos::View< SNAcomplex*, + // scratch size: 32 atoms * (twojmax+1) cached values * 2 for u, du, no double buffer + const int tile_size = vector_length * (twojmax + 1); + typedef Kokkos::View< complex*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits > ScratchViewType; - int scratch_size = ScratchViewType::shmem_size( 4 * team_size * tile_size); + int scratch_size = ScratchViewType::shmem_size(2 * team_size * tile_size); - typename Kokkos::TeamPolicy policy_fused_deidrj(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - policy_fused_deidrj = policy_fused_deidrj.set_scratch_size(0, Kokkos::PerTeam( scratch_size )); + // total number of teams needed + int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + + // (natoms / 32) * (max_neighs) * ("bend" locations) + int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + int n_teams_div = (n_teams + team_size - 1) / team_size; + + typename Kokkos::TeamPolicy policy_fused_deidrj(n_teams_div,team_size,vector_length); + policy_fused_deidrj = policy_fused_deidrj.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); for (int k = 0; k < 3; k++) { snaKK.set_dir(k); Kokkos::parallel_for("ComputeFusedDeidrj",policy_fused_deidrj,*this); } + } #endif // LMP_KOKKOS_GPU @@ -403,27 +427,26 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) //ComputeForce { int team_size = team_size_default; - int vector_length = vector_length_default; if (evflag) { if (neighflag == HALF) { - check_team_size_reduce >(chunk_size,team_size,vector_length); + check_team_size_reduce >(chunk_size,team_size); typename Kokkos::TeamPolicy > policy_force(chunk_size,team_size,vector_length); Kokkos::parallel_reduce(policy_force ,*this,ev_tmp); } else if (neighflag == HALFTHREAD) { - check_team_size_reduce >(chunk_size,team_size,vector_length); + check_team_size_reduce >(chunk_size,team_size); typename Kokkos::TeamPolicy > policy_force(chunk_size,team_size,vector_length); Kokkos::parallel_reduce(policy_force ,*this,ev_tmp); } } else { if (neighflag == HALF) { - check_team_size_for >(chunk_size,team_size,vector_length); + check_team_size_for >(chunk_size,team_size); typename Kokkos::TeamPolicy > policy_force(chunk_size,team_size,vector_length); Kokkos::parallel_for(policy_force ,*this); } else if (neighflag == HALFTHREAD) { - check_team_size_for >(chunk_size,team_size,vector_length); + check_team_size_for >(chunk_size,team_size); typename Kokkos::TeamPolicy > policy_force(chunk_size,team_size,vector_length); Kokkos::parallel_for(policy_force ,*this); @@ -478,8 +501,8 @@ void PairSNAPKokkos::compute(int eflag_in, int vflag_in) allocate all arrays ------------------------------------------------------------------------- */ -template -void PairSNAPKokkos::allocate() +template +void PairSNAPKokkos::allocate() { PairSNAP::allocate(); @@ -492,8 +515,8 @@ void PairSNAPKokkos::allocate() init for one type pair i,j and corresponding j,i ------------------------------------------------------------------------- */ -template -double PairSNAPKokkos::init_one(int i, int j) +template +double PairSNAPKokkos::init_one(int i, int j) { double cutone = PairSNAP::init_one(i,j); k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone; @@ -506,16 +529,16 @@ double PairSNAPKokkos::init_one(int i, int j) set coeffs for one or more type pairs ------------------------------------------------------------------------- */ -template -void PairSNAPKokkos::coeff(int narg, char **arg) +template +void PairSNAPKokkos::coeff(int narg, char **arg) { PairSNAP::coeff(narg,arg); // Set up element lists - d_radelem = Kokkos::View("pair:radelem",nelements); - d_wjelem = Kokkos::View("pair:wjelem",nelements); - d_coeffelem = Kokkos::View("pair:coeffelem",nelements,ncoeffall); + d_radelem = Kokkos::View("pair:radelem",nelements); + d_wjelem = Kokkos::View("pair:wjelem",nelements); + d_coeffelem = Kokkos::View("pair:coeffelem",nelements,ncoeffall); auto h_radelem = Kokkos::create_mirror_view(d_radelem); auto h_wjelem = Kokkos::create_mirror_view(d_wjelem); @@ -539,25 +562,374 @@ void PairSNAPKokkos::coeff(int narg, char **arg) Kokkos::deep_copy(d_coeffelem,h_coeffelem); Kokkos::deep_copy(d_map,h_map); - snaKK = SNAKokkos(rfac0,twojmax, + snaKK = SNAKokkos(rfac0,twojmax, rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements); snaKK.grow_rij(0,0); snaKK.init(); } /* ---------------------------------------------------------------------- - Begin routines that are called on both CPU and GPU codepaths + Begin routines that are unique to the GPU codepath. These take advantage + of AoSoA data layouts and scratch memory for recursive polynomials ------------------------------------------------------------------------- */ -/* ---------------------------------------------------------------------- */ - -template +template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPBeta,const int& ii) const { + + if (ii >= chunk_size) return; + + const int iatom_mod = ii % vector_length; + const int iatom_div = ii / vector_length; + + const int i = d_ilist[ii + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + SNAKokkos my_sna = snaKK; + + auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + d_beta_pack(iatom_mod,icoeff,iatom_div) = d_coeffi[icoeff+1]; + } + + if (quadraticflag) { + const auto idxb_max = my_sna.idxb_max; + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + auto bveci = my_sna.blist(idxb, idx_chem, ii); + d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + real_type bvecj = my_sna.blist(jdxb, jdx_chem, ii); + d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj; + d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci; + k++; + } + } + } +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { + + SNAKokkos my_sna = snaKK; + + // extract atom number + int ii = team.team_rank() + team.league_rank() * team.team_size(); + if (ii >= chunk_size) return; + + // get a pointer to scratch memory + // This is used to cache whether or not an atom is within the cutoff. + // If it is, type_cache is assigned to the atom type. + // If it's not, it's assigned to -1. + const int tile_size = max_neighs; // number of elements per thread + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * tile_size; // offset into pointer for entire team + int* type_cache = (int*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(int), 0) + scratch_shift; + + // Load various info about myself up front + const int i = d_ilist[ii + chunk_offset]; + const F_FLOAT xtmp = x(i,0); + const F_FLOAT ytmp = x(i,1); + const F_FLOAT ztmp = x(i,2); + const int itype = type[i]; + const int ielem = d_map[itype]; + const double radi = d_radelem[ielem]; + + const int num_neighs = d_numneigh[i]; + + // rij[][3] = displacements between atom I and those neighbors + // inside = indices of neighbors of I within cutoff + // wj = weights for neighbors of I within cutoff + // rcutij = cutoffs for neighbors of I within cutoff + // note Rij sign convention => dU/dRij = dU/dRj = -dU/dRi + + // Compute the number of neighbors, store rsq + int ninside = 0; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,num_neighs), + [&] (const int jj, int& count) { + T_INT j = d_neighbors(i,jj); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + + int jtype = type(j); + const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; + + if (rsq >= rnd_cutsq(itype,jtype)) { + jtype = -1; // use -1 to signal it's outside the radius + } + + type_cache[jj] = jtype; + + if (jtype >= 0) + count++; + }, ninside); + + d_ninside(ii) = ninside; + + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,num_neighs), + [&] (const int jj, int& offset, bool final) { + + const int jtype = type_cache[jj]; + + if (jtype >= 0) { + if (final) { + T_INT j = d_neighbors(i,jj); + const F_FLOAT dx = x(j,0) - xtmp; + const F_FLOAT dy = x(j,1) - ytmp; + const F_FLOAT dz = x(j,2) - ztmp; + const int elem_j = d_map[jtype]; + my_sna.rij(ii,offset,0) = static_cast(dx); + my_sna.rij(ii,offset,1) = static_cast(dy); + my_sna.rij(ii,offset,2) = static_cast(dz); + my_sna.wj(ii,offset) = static_cast(d_wjelem[elem_j]); + my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[elem_j])*rcutfac); + my_sna.inside(ii,offset) = j; + if (chemflag) + my_sna.element(ii,offset) = elem_j; + else + my_sna.element(ii,offset) = 0; + } + offset++; + } + }); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int ii = iatom_mod + iatom_div * vector_length; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jnbor >= ninside) return; + + my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int ii = iatom_mod + iatom_div * vector_length; + if (ii >= chunk_size) return; + + int itype = type(ii); + int ielem = d_map[itype]; + + my_sna.pre_ui(iatom_mod, j, ielem, iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + const int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + const int jj = jj_jbend - jbend * max_neighs; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_ui(team,iatom_mod, jbend, jj, iatom_div); + }); + +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (idxu > my_sna.idxu_max) return; + + int elem_count = chemflag ? nelements : 1; + + for (int ielem = 0; ielem < elem_count; ielem++) { + + const FullHalfMapper mapper = my_sna.idxu_full_half[idxu]; + + auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); + + if (mapper.flip_sign == 1) { + utot_im = -utot_im; + } else if (mapper.flip_sign == -1) { + utot_re = -utot_re; + } + + my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; + + if (mapper.flip_sign == 0) { + my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; + } + } +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (jjz >= my_sna.idxz_max) return; + + my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (jjz >= my_sna.idxz_max) return; + + my_sna.compute_zi(iatom_mod,jjz,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (jjb >= my_sna.idxb_max) return; + + my_sna.compute_bi(iatom_mod,jjb,iatom_div); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { + SNAKokkos my_sna = snaKK; + + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + if (idxb >= my_sna.idxb_max) return; + + const int ntriples = my_sna.ntriples; + + for (int itriple = 0; itriple < ntriples; itriple++) { + + const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div); + + my_sna.blist(idxb, itriple, iatom) = blocal; + } + +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; + + // extract neighbor index, iatom_div + const int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + const int jj = jj_jbend - jbend * max_neighs; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + my_sna.compute_fused_deidrj(team, iatom_mod, jbend, jj, iatom_div); + + }); + +} + +/* ---------------------------------------------------------------------- + Begin routines that are unique to the CPU codepath. These do not take + advantage of AoSoA data layouts, but that could be a good point of + future optimization and unification with the above kernels. It's unlikely + that scratch memory optimizations will ever be useful for the CPU due to + different arithmetic intensity requirements for the CPU vs GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPBetaCPU,const int& ii) const { + + const int i = d_ilist[ii + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + SNAKokkos my_sna = snaKK; + + auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); + + for (int icoeff = 0; icoeff < ncoeff; icoeff++) + d_beta(icoeff,ii) = d_coeffi[icoeff+1]; + + if (quadraticflag) { + const auto idxb_max = my_sna.idxb_max; + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + auto bveci = my_sna.blist(idxb,idx_chem,ii); + d_beta(icoeff,ii) += d_coeffi[k]*bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + auto bvecj = my_sna.blist(jdxb,jdx_chem,ii); + d_beta(icoeff,ii) += d_coeffi[k]*bvecj; + d_beta(jcoeff,ii) += d_coeffi[k]*bveci; + k++; + } + } + } +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + int ii = team.league_rank(); const int i = d_ilist[ii + chunk_offset]; - SNAKokkos my_sna = snaKK; + SNAKokkos my_sna = snaKK; const double xtmp = x(i,0); const double ytmp = x(i,1); const double ztmp = x(i,2); @@ -585,7 +957,7 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typen const int jtype = type(j); const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; - if ( rsq < rnd_cutsq(itype,jtype) ) + if (rsq < rnd_cutsq(itype,jtype)) count++; }); },ninside); @@ -605,22 +977,13 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typen const F_FLOAT rsq = dx*dx + dy*dy + dz*dz; const int elem_j = d_map[jtype]; - if ( rsq < rnd_cutsq(itype,jtype) ) { + if (rsq < rnd_cutsq(itype,jtype)) { if (final) { -#ifdef LMP_KOKKOS_GPU - if (!host_flag) { - my_sna.compute_cayley_klein(ii, offset, dx, dy, dz, (radi + d_radelem[elem_j])*rcutfac, - d_wjelem[elem_j]); - } else { -#endif - my_sna.rij(ii,offset,0) = dx; - my_sna.rij(ii,offset,1) = dy; - my_sna.rij(ii,offset,2) = dz; - my_sna.wj(ii,offset) = d_wjelem[elem_j]; - my_sna.rcutij(ii,offset) = (radi + d_radelem[elem_j])*rcutfac; -#ifdef LMP_KOKKOS_GPU - } -#endif + my_sna.rij(ii,offset,0) = static_cast(dx); + my_sna.rij(ii,offset,1) = static_cast(dy); + my_sna.rij(ii,offset,2) = static_cast(dz); + my_sna.wj(ii,offset) = static_cast(d_wjelem[elem_j]); + my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[elem_j])*rcutfac); my_sna.inside(ii,offset) = j; if (chemflag) my_sna.element(ii,offset) = elem_j; @@ -632,282 +995,11 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typen }); } -/* ---------------------------------------------------------------------- - Begin routines that are unique to the GPU codepath. These take advantage - of AoSoA data layouts and scratch memory for recursive polynomials -------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBeta,const int& ii) const { - - if (ii >= chunk_size) return; - - const int iatom_mod = ii % 32; - const int iatom_div = ii / 32; - - const int i = d_ilist[ii + chunk_offset]; - const int itype = type[i]; - const int ielem = d_map[itype]; - SNAKokkos my_sna = snaKK; - - Kokkos::View> - d_coeffi(d_coeffelem,ielem,Kokkos::ALL); - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - d_beta_pack(iatom_mod,icoeff,iatom_div) = d_coeffi[icoeff+1]; - } - - if (quadraticflag) { - const auto idxb_max = my_sna.idxb_max; - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - const auto idxb = icoeff % idxb_max; - const auto idx_chem = icoeff / idxb_max; - double bveci = my_sna.blist(idxb, idx_chem, ii); - d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - const auto jdxb = jcoeff % idxb_max; - const auto jdx_chem = jcoeff / idxb_max; - double bvecj = my_sna.blist(jdxb, jdx_chem, ii); - d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj; - d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci; - k++; - } - } - } -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUi,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // Extract the atom number - const int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; - int itype = type(ii); - int ielem = d_map[itype]; - - my_sna.pre_ui(team,ii,ielem); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUi,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; - - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - my_sna.compute_ui(team,ii,jj); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * 32; - if (iatom >= chunk_size) return; - - if (j > twojmax) return; - - int elem_count = chemflag ? nelements : 1; - - for (int ielem = 0; ielem < elem_count; ielem++) { - const int jju_half = my_sna.idxu_half_block(j); - const int jju = my_sna.idxu_block(j); - - for (int mb = 0; 2*mb <= j; mb++) { - for (int ma = 0; ma <= j; ma++) { - // Extract top half - - const int idxu_shift = mb * (j + 1) + ma; - const int idxu_half = jju_half + idxu_shift; - const int idxu = jju + idxu_shift; - - auto utot_re = my_sna.ulisttot_re(idxu_half, ielem, iatom); - auto utot_im = my_sna.ulisttot_im(idxu_half, ielem, iatom); - - // Store - my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; - - // Also zero yi - my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div) = 0.; - my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div) = 0.; - - // Symmetric term - const int sign_factor = (((ma+mb)%2==0)?1:-1); - const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1); - - if (sign_factor == 1) { - utot_im = -utot_im; - } else { - utot_re = -utot_re; - } - - my_sna.ulisttot_pack(iatom_mod, idxu_flip, ielem, iatom_div) = { utot_re, utot_im }; - - // No need to zero symmetrized ylist - } - } - } -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * 32; - if (iatom >= chunk_size) return; - - if (jjz >= my_sna.idxz_max) return; - - my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformYi,const int iatom_mod, const int idxu_half, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * 32; - if (iatom >= chunk_size) return; - - if (idxu_half >= my_sna.idxu_half_max) return; - - int elem_count = chemflag ? nelements : 1; - for (int ielem = 0; ielem < elem_count; ielem++) { - const auto y_re = my_sna.ylist_pack_re(iatom_mod, idxu_half, ielem, iatom_div); - const auto y_im = my_sna.ylist_pack_im(iatom_mod, idxu_half, ielem, iatom_div); - - my_sna.ylist(idxu_half, ielem, iatom) = { y_re, y_im }; - } - -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * 32; - if (iatom >= chunk_size) return; - - if (jjz >= my_sna.idxz_max) return; - - my_sna.compute_zi(iatom_mod,jjz,iatom_div); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * 32; - if (iatom >= chunk_size) return; - - if (jjb >= my_sna.idxb_max) return; - - my_sna.compute_bi(iatom_mod,jjb,iatom_div); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * 32; - if (iatom >= chunk_size) return; - - if (idxb >= my_sna.idxb_max) return; - - const int ntriples = my_sna.ntriples; - - for (int itriple = 0; itriple < ntriples; itriple++) { - - const auto blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div); - - my_sna.blist(idxb, itriple, iatom) = blocal; - } - -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrj,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; - - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - my_sna.compute_fused_deidrj(team,ii,jj); -} - -/* ---------------------------------------------------------------------- - Begin routines that are unique to the CPU codepath. These do not take - advantage of AoSoA data layouts, but that could be a good point of - future optimization and unification with the above kernels. It's unlikely - that scratch memory optimizations will ever be useful for the CPU due to - different arithmetic intensity requirements for the CPU vs GPU. -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBetaCPU,const int& ii) const { - - const int i = d_ilist[ii + chunk_offset]; - const int itype = type[i]; - const int ielem = d_map[itype]; - SNAKokkos my_sna = snaKK; - - Kokkos::View> - d_coeffi(d_coeffelem,ielem,Kokkos::ALL); - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) - d_beta(icoeff,ii) = d_coeffi[icoeff+1]; - - if (quadraticflag) { - const auto idxb_max = my_sna.idxb_max; - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - const auto idxb = icoeff % idxb_max; - const auto idx_chem = icoeff / idxb_max; - double bveci = my_sna.blist(idxb,idx_chem,ii); - d_beta(icoeff,ii) += d_coeffi[k]*bveci; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - const auto jdxb = jcoeff % idxb_max; - const auto jdx_chem = jcoeff / idxb_max; - double bvecj = my_sna.blist(jdxb,jdx_chem,ii); - d_beta(icoeff,ii) += d_coeffi[k]*bvecj; - d_beta(jcoeff,ii) += d_coeffi[k]*bveci; - k++; - } - } - } -} - - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; +void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; // Extract the atom number const int ii = team.team_rank() + team.team_size() * team.league_rank(); @@ -920,10 +1012,10 @@ void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU,const typename -template +template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; // Extract the atom number int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); @@ -937,10 +1029,10 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typen my_sna.compute_ui_cpu(team,ii,jj); } -template +template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const { - SNAKokkos my_sna = snaKK; +void PairSNAPKokkos::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const { + SNAKokkos my_sna = snaKK; if (iatom >= chunk_size) return; @@ -987,32 +1079,32 @@ void PairSNAPKokkos::operator() (TagPairSNAPTransformUiCPU, const in } } -template +template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU,const int& ii) const { - SNAKokkos my_sna = snaKK; +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU,const int& ii) const { + SNAKokkos my_sna = snaKK; my_sna.compute_yi_cpu(ii,d_beta); } -template +template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZiCPU,const int& ii) const { - SNAKokkos my_sna = snaKK; +void PairSNAPKokkos::operator() (TagPairSNAPComputeZiCPU,const int& ii) const { + SNAKokkos my_sna = snaKK; my_sna.compute_zi_cpu(ii); } -template +template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { int ii = team.league_rank(); - SNAKokkos my_sna = snaKK; + SNAKokkos my_sna = snaKK; my_sna.compute_bi_cpu(team,ii); } -template +template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; +void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; // Extract the atom number int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); @@ -1026,10 +1118,10 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const t my_sna.compute_duidrj_cpu(team,ii,jj); } -template +template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; +void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { + SNAKokkos my_sna = snaKK; // Extract the atom number int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); @@ -1049,10 +1141,10 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const t likely not worth it. ------------------------------------------------------------------------- */ -template +template template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeForce,const typename Kokkos::TeamPolicy >::member_type& team, EV_FLOAT& ev) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeForce,const typename Kokkos::TeamPolicy >::member_type& team, EV_FLOAT& ev) const { // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial @@ -1061,7 +1153,7 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeForce my_sna = snaKK; + SNAKokkos my_sna = snaKK; const int ninside = d_ninside(ii); Kokkos::parallel_for (Kokkos::TeamThreadRange(team,ninside), @@ -1102,14 +1194,13 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeForce> - d_coeffi(d_coeffelem,ielem,Kokkos::ALL); + auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); Kokkos::single(Kokkos::PerTeam(team), [&] () { // evdwl = energy of atom I, sum over coeffs_k * Bi_k - double evdwl = d_coeffi[0]; + auto evdwl = d_coeffi[0]; // E = beta.B + 0.5*B^t.alpha.B @@ -1130,13 +1221,13 @@ void PairSNAPKokkos::operator() (TagPairSNAPComputeForce::operator() (TagPairSNAPComputeForce +template template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeForce,const typename Kokkos::TeamPolicy >::member_type& team) const { +void PairSNAPKokkos::operator() (TagPairSNAPComputeForce,const typename Kokkos::TeamPolicy >::member_type& team) const { EV_FLOAT ev; this->template operator()(TagPairSNAPComputeForce(), team, ev); } /* ---------------------------------------------------------------------- */ -template +template template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j, +void PairSNAPKokkos::v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j, const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz, const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const { @@ -1209,24 +1300,24 @@ void PairSNAPKokkos::v_tally_xyz(EV_FLOAT &ev, const int &i, const i memory usage ------------------------------------------------------------------------- */ -template -double PairSNAPKokkos::memory_usage() +template +double PairSNAPKokkos::memory_usage() { double bytes = Pair::memory_usage(); int n = atom->ntypes+1; bytes += n*n*sizeof(int); - bytes += n*n*sizeof(double); - bytes += (2*ncoeffall)*sizeof(double); - bytes += (ncoeff*3)*sizeof(double); + bytes += n*n*sizeof(real_type); + bytes += (2*ncoeffall)*sizeof(real_type); + bytes += (ncoeff*3)*sizeof(real_type); bytes += snaKK.memory_usage(); return bytes; } /* ---------------------------------------------------------------------- */ -template +template template -void PairSNAPKokkos::check_team_size_for(int inum, int &team_size, int vector_length) { +void PairSNAPKokkos::check_team_size_for(int inum, int &team_size) { int team_size_max; team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); @@ -1235,9 +1326,9 @@ void PairSNAPKokkos::check_team_size_for(int inum, int &team_size, i team_size = team_size_max/vector_length; } -template +template template -void PairSNAPKokkos::check_team_size_reduce(int inum, int &team_size, int vector_length) { +void PairSNAPKokkos::check_team_size_reduce(int inum, int &team_size) { int team_size_max; team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag()); @@ -1246,4 +1337,79 @@ void PairSNAPKokkos::check_team_size_reduce(int inum, int &team_size team_size = team_size_max/vector_length; } + +/* ---------------------------------------------------------------------- + routines used by template reference classes +------------------------------------------------------------------------- */ + +template +PairSNAPKokkosDevice::PairSNAPKokkosDevice(class LAMMPS *lmp) + : PairSNAPKokkos(lmp) { ; } + +template +void PairSNAPKokkosDevice::coeff(int narg, char **arg) +{ + Base::coeff(narg, arg); +} + +template +void PairSNAPKokkosDevice::init_style() +{ + Base::init_style(); +} + +template +double PairSNAPKokkosDevice::init_one(int i, int j) +{ + return Base::init_one(i, j); +} + +template +void PairSNAPKokkosDevice::compute(int eflag_in, int vflag_in) +{ + Base::compute(eflag_in, vflag_in); +} + +template +double PairSNAPKokkosDevice::memory_usage() +{ + return Base::memory_usage(); +} + +#ifdef LMP_KOKKOS_GPU +template +PairSNAPKokkosHost::PairSNAPKokkosHost(class LAMMPS *lmp) + : PairSNAPKokkos(lmp) { ; } + +template +void PairSNAPKokkosHost::coeff(int narg, char **arg) +{ + Base::coeff(narg, arg); +} + +template +void PairSNAPKokkosHost::init_style() +{ + Base::init_style(); +} + +template +double PairSNAPKokkosHost::init_one(int i, int j) +{ + return Base::init_one(i, j); +} + +template +void PairSNAPKokkosHost::compute(int eflag_in, int vflag_in) +{ + Base::compute(eflag_in, vflag_in); +} + +template +double PairSNAPKokkosHost::memory_usage() +{ + return Base::memory_usage(); +} +#endif + } diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 7804aaef18..f183acdb57 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -25,45 +25,78 @@ namespace LAMMPS_NS { -template +template +struct WignerWrapper { + using real_type = real_type_; + using complex = SNAComplex; + static constexpr int vector_length = vector_length_; + + const int offset; // my offset into the vector (0, ..., vector_length - 1) + real_type* buffer; // buffer of real numbers + + KOKKOS_INLINE_FUNCTION + WignerWrapper(complex* buffer_, const int offset_) + : offset(offset_), buffer(reinterpret_cast(buffer_)) + { ; } + + KOKKOS_INLINE_FUNCTION + complex get(const int& ma) { + return complex(buffer[offset + 2 * vector_length * ma], buffer[offset + vector_length + 2 * vector_length * ma]); + } + + KOKKOS_INLINE_FUNCTION + void set(const int& ma, const complex& store) { + buffer[offset + 2 * vector_length * ma] = store.re; + buffer[offset + vector_length + 2 * vector_length * ma] = store.im; + } +}; + +struct alignas(8) FullHalfMapper { + int idxu_half; + int flip_sign; // 0 -> isn't flipped, 1 -> conj, -1 -> -conj +}; + +template class SNAKokkos { public: + using real_type = real_type_; + using complex = SNAComplex; + static constexpr int vector_length = vector_length_; + typedef Kokkos::View t_sna_1i; - typedef Kokkos::View t_sna_1d; - typedef Kokkos::View::value, Kokkos::MemoryTraits > t_sna_1d_atomic; + typedef Kokkos::View t_sna_1d; + typedef Kokkos::View::value, Kokkos::MemoryTraits > t_sna_1d_atomic; typedef Kokkos::View t_sna_2i; - typedef Kokkos::View t_sna_2d; - typedef Kokkos::View t_sna_2d_ll; - typedef Kokkos::View t_sna_3d; - typedef Kokkos::View t_sna_3d_ll; - typedef Kokkos::View t_sna_4d; - typedef Kokkos::View t_sna_4d_ll; - typedef Kokkos::View t_sna_3d3; - typedef Kokkos::View t_sna_5d; + typedef Kokkos::View t_sna_2d; + typedef Kokkos::View t_sna_2d_ll; + typedef Kokkos::View t_sna_3d; + typedef Kokkos::View t_sna_3d_ll; + typedef Kokkos::View t_sna_4d; + typedef Kokkos::View t_sna_4d_ll; + typedef Kokkos::View t_sna_3d3; + typedef Kokkos::View t_sna_5d; - typedef Kokkos::View t_sna_1c; - typedef Kokkos::View::value, Kokkos::MemoryTraits > t_sna_1c_atomic; - typedef Kokkos::View t_sna_2c; - typedef Kokkos::View t_sna_2c_ll; - typedef Kokkos::View t_sna_2c_lr; - typedef Kokkos::View t_sna_3c; - typedef Kokkos::View t_sna_3c_ll; - typedef Kokkos::View t_sna_4c; - typedef Kokkos::View t_sna_4c3_ll; - typedef Kokkos::View t_sna_4c_ll; - typedef Kokkos::View t_sna_3c3; - typedef Kokkos::View t_sna_5c; - - typedef Kokkos::View t_sna_2ckp; + typedef Kokkos::View t_sna_1c; + typedef Kokkos::View::value, Kokkos::MemoryTraits > t_sna_1c_atomic; + typedef Kokkos::View t_sna_2c; + typedef Kokkos::View t_sna_2c_ll; + typedef Kokkos::View t_sna_2c_lr; + typedef Kokkos::View t_sna_3c; + typedef Kokkos::View t_sna_3c_ll; + typedef Kokkos::View t_sna_4c; + typedef Kokkos::View t_sna_4c3_ll; + typedef Kokkos::View t_sna_4c_ll; + typedef Kokkos::View t_sna_3c3; + typedef Kokkos::View t_sna_5c; inline SNAKokkos() {}; KOKKOS_INLINE_FUNCTION - SNAKokkos(const SNAKokkos& sna, const typename Kokkos::TeamPolicy::member_type& team); + SNAKokkos(const SNAKokkos& sna, const typename Kokkos::TeamPolicy::member_type& team); inline - SNAKokkos(double, int, double, int, int, int, int, int, int); + SNAKokkos(real_type, int, real_type, int, int, int, int, int, int); KOKKOS_INLINE_FUNCTION ~SNAKokkos(); @@ -81,17 +114,16 @@ inline // functions for bispectrum coefficients, GPU only KOKKOS_INLINE_FUNCTION - void compute_cayley_klein(const int&, const int&, const double&, const double&, - const double&, const double&, const double&); + void compute_cayley_klein(const int&, const int&, const int&); KOKKOS_INLINE_FUNCTION - void pre_ui(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&); // ForceSNAP + void pre_ui(const int&, const int&, const int&, const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); // ForceSNAP + void compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_zi(const int&, const int&, const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi(int,int,int, - const Kokkos::View &beta_pack); // ForceSNAP + const Kokkos::View &beta_pack); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_bi(const int&, const int&, const int&); // ForceSNAP @@ -104,34 +136,33 @@ inline void compute_zi_cpu(const int&); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_yi_cpu(int, - const Kokkos::View &beta); // ForceSNAP + const Kokkos::View &beta); // ForceSNAP KOKKOS_INLINE_FUNCTION void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP // functions for derivatives, GPU only KOKKOS_INLINE_FUNCTION - void compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int, const int); //ForceSNAP + void compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int); //ForceSNAP // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP KOKKOS_INLINE_FUNCTION void compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP - KOKKOS_INLINE_FUNCTION - double compute_sfac(double, double); // add_uarraytot, compute_duarray - KOKKOS_INLINE_FUNCTION - double compute_dsfac(double, double); // compute_duarray - KOKKOS_INLINE_FUNCTION - void compute_s_dsfac(const double, const double, double&, double&); // compute_cayley_klein - // efficient complex FMA - // efficient caxpy (i.e., y += a x) - static KOKKOS_FORCEINLINE_FUNCTION - void caxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y); + KOKKOS_INLINE_FUNCTION + real_type compute_sfac(real_type, real_type); // add_uarraytot, compute_duarray + + KOKKOS_INLINE_FUNCTION + real_type compute_dsfac(real_type, real_type); // compute_duarray + + KOKKOS_INLINE_FUNCTION + void compute_s_dsfac(const real_type, const real_type, real_type&, real_type&); // compute_cayley_klein - // efficient complex FMA, conjugate of scalar static KOKKOS_FORCEINLINE_FUNCTION - void caconjxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y); + void sincos_wrapper(double x, double* sin_, double *cos_) { sincos(x, sin_, cos_); } + static KOKKOS_FORCEINLINE_FUNCTION + void sincos_wrapper(float x, float* sin_, float *cos_) { sincosf(x, sin_, cos_); } // Set the direction for split ComputeDuidrj KOKKOS_INLINE_FUNCTION @@ -146,10 +177,6 @@ inline //per sna class instance for OMP use - // Alternative to rij, wj, rcutij... - // just calculate everything up front - t_sna_2ckp cayleyklein; - // Per InFlight Particle t_sna_3d rij; t_sna_2i inside; @@ -175,8 +202,14 @@ inline t_sna_4c3_ll dulist; // Modified structures for GPU backend - t_sna_3d_ll ulisttot_re; // split real, - t_sna_3d_ll ulisttot_im; // imag + t_sna_3c_ll a_pack; // Cayley-Klein `a` + t_sna_3c_ll b_pack; // `b` + t_sna_4c_ll da_pack; // `da` + t_sna_4c_ll db_pack; // `db` + t_sna_4d_ll sfac_pack; // sfac, dsfac_{x,y,z} + + t_sna_4d_ll ulisttot_re_pack; // split real, + t_sna_4d_ll ulisttot_im_pack; // imag, AoSoA, flattened t_sna_4c_ll ulisttot_pack; // AoSoA layout t_sna_4c_ll zlist_pack; // AoSoA layout t_sna_4d_ll blist_pack; @@ -191,7 +224,7 @@ inline int ntriples; private: - double rmin0, rfac0; + real_type rmin0, rfac0; //use indexlist instead of loops, constructor generates these // Same across all SNAKokkos @@ -203,6 +236,7 @@ public: Kokkos::View idxu_block; Kokkos::View idxu_half_block; Kokkos::View idxu_cache_block; + Kokkos::View idxu_full_half; private: Kokkos::View idxz_block; @@ -231,12 +265,12 @@ inline void init_rootpqarray(); // init() KOKKOS_INLINE_FUNCTION - void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, double, double, double, int); // compute_ui + void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, const real_type&, const real_type&, const real_type&, int); // compute_ui KOKKOS_INLINE_FUNCTION void compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, - double, double, double, - double, double); // compute_ui_cpu + const real_type&, const real_type&, const real_type&, + const real_type&, const real_type&); // compute_ui_cpu inline @@ -246,8 +280,8 @@ inline int compute_ncoeff(); // SNAKokkos() KOKKOS_INLINE_FUNCTION void compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, - double, double, double, // compute_duidrj_cpu - double, double, double, double, double); + const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu + const real_type&, const real_type&, const real_type&, const real_type&, const real_type&); // Sets the style for the switching function // 0 = none @@ -259,11 +293,11 @@ inline int bnorm_flag; // Self-weight - double wself; + real_type wself; int wselfall_flag; int bzero_flag; // 1 if bzero subtracted from barray - Kokkos::View bzero; // array of B values for isolated atoms + Kokkos::View bzero; // array of B values for isolated atoms // for per-direction dulist calculation, specify the direction. int dir; diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index 3060aa9527..23c1670bd8 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -25,16 +25,16 @@ namespace LAMMPS_NS { static const double MY_PI = 3.14159265358979323846; // pi -template +template inline -SNAKokkos::SNAKokkos(double rfac0_in, - int twojmax_in, double rmin0_in, int switch_flag_in, int bzero_flag_in, +SNAKokkos::SNAKokkos(real_type rfac0_in, + int twojmax_in, real_type rmin0_in, int switch_flag_in, int bzero_flag_in, int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in, int nelements_in) { LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; host_flag = (execution_space == LAMMPS_NS::Host); - wself = 1.0; + wself = static_cast(1.0); rfac0 = rfac0_in; rmin0 = rmin0_in; @@ -63,7 +63,7 @@ SNAKokkos::SNAKokkos(double rfac0_in, cglist = t_sna_1d("SNAKokkos::cglist",idxcg_max); if (bzero_flag) { - bzero = Kokkos::View("sna:bzero",twojmax+1); + bzero = Kokkos::View("sna:bzero",twojmax+1); auto h_bzero = Kokkos::create_mirror_view(bzero); double www = wself*wself*wself; @@ -78,20 +78,20 @@ SNAKokkos::SNAKokkos(double rfac0_in, /* ---------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -SNAKokkos::~SNAKokkos() +SNAKokkos::~SNAKokkos() { } -template +template inline -void SNAKokkos::build_indexlist() +void SNAKokkos::build_indexlist() { // index list for cglist int jdim = twojmax + 1; - idxcg_block = Kokkos::View("SNAKokkos::idxcg_block",jdim,jdim,jdim); + idxcg_block = Kokkos::View(Kokkos::NoInit("SNAKokkos::idxcg_block"),jdim,jdim,jdim); auto h_idxcg_block = Kokkos::create_mirror_view(idxcg_block); int idxcg_count = 0; @@ -109,7 +109,7 @@ void SNAKokkos::build_indexlist() // index list for uarray // need to include both halves - idxu_block = Kokkos::View("SNAKokkos::idxu_block",jdim); + idxu_block = Kokkos::View(Kokkos::NoInit("SNAKokkos::idxu_block"),jdim); auto h_idxu_block = Kokkos::create_mirror_view(idxu_block); int idxu_count = 0; @@ -124,7 +124,7 @@ void SNAKokkos::build_indexlist() Kokkos::deep_copy(idxu_block,h_idxu_block); // index list for half uarray - idxu_half_block = Kokkos::View("SNAKokkos::idxu_half_block",jdim); + idxu_half_block = Kokkos::View(Kokkos::NoInit("SNAKokkos::idxu_half_block"),jdim); auto h_idxu_half_block = Kokkos::create_mirror_view(idxu_half_block); int idxu_half_count = 0; @@ -137,10 +137,35 @@ void SNAKokkos::build_indexlist() idxu_half_max = idxu_half_count; Kokkos::deep_copy(idxu_half_block, h_idxu_half_block); + // mapping between full and half indexing, encoding flipping + idxu_full_half = Kokkos::View(Kokkos::NoInit("SNAKokkos::idxu_full_half"),idxu_max); + auto h_idxu_full_half = Kokkos::create_mirror_view(idxu_full_half); + + idxu_count = 0; + for(int j = 0; j <= twojmax; j++) { + int jju_half = h_idxu_half_block[j]; + for(int mb = 0; mb <= j; mb++) { + for(int ma = 0; ma <= j; ma++) { + FullHalfMapper mapper; + if (2*mb <= j) { + mapper.idxu_half = jju_half + mb * (j + 1) + ma; + mapper.flip_sign = 0; + } else { + mapper.idxu_half = jju_half + (j + 1 - mb) * (j + 1) - (ma + 1); + mapper.flip_sign = (((ma+mb)%2==0)?1:-1); + } + h_idxu_full_half[idxu_count] = mapper; + idxu_count++; + } + } + } + + Kokkos::deep_copy(idxu_full_half, h_idxu_full_half); + // index list for "cache" uarray // this is the GPU scratch memory requirements // applied the CPU structures - idxu_cache_block = Kokkos::View("SNAKokkos::idxu_cache_block",jdim); + idxu_cache_block = Kokkos::View(Kokkos::NoInit("SNAKokkos::idxu_cache_block"),jdim); auto h_idxu_cache_block = Kokkos::create_mirror_view(idxu_cache_block); int idxu_cache_count = 0; @@ -162,7 +187,7 @@ void SNAKokkos::build_indexlist() if (j >= j1) idxb_count++; idxb_max = idxb_count; - idxb = Kokkos::View("SNAKokkos::idxb",idxb_max); + idxb = Kokkos::View(Kokkos::NoInit("SNAKokkos::idxb"),idxb_max); auto h_idxb = Kokkos::create_mirror_view(idxb); idxb_count = 0; @@ -179,7 +204,7 @@ void SNAKokkos::build_indexlist() // reverse index list for beta and b - idxb_block = Kokkos::View("SNAKokkos::idxb_block",jdim,jdim,jdim); + idxb_block = Kokkos::View(Kokkos::NoInit("SNAKokkos::idxb_block"),jdim,jdim,jdim); auto h_idxb_block = Kokkos::create_mirror_view(idxb_block); idxb_count = 0; @@ -205,10 +230,10 @@ void SNAKokkos::build_indexlist() idxz_count++; idxz_max = idxz_count; - idxz = Kokkos::View("SNAKokkos::idxz",idxz_max); + idxz = Kokkos::View(Kokkos::NoInit("SNAKokkos::idxz"),idxz_max); auto h_idxz = Kokkos::create_mirror_view(idxz); - idxz_block = Kokkos::View("SNAKokkos::idxz_block", jdim,jdim,jdim); + idxz_block = Kokkos::View(Kokkos::NoInit("SNAKokkos::idxz_block"), jdim,jdim,jdim); auto h_idxz_block = Kokkos::create_mirror_view(idxz_block); idxz_count = 0; @@ -249,53 +274,63 @@ void SNAKokkos::build_indexlist() /* ---------------------------------------------------------------------- */ -template +template inline -void SNAKokkos::init() +void SNAKokkos::init() { init_clebsch_gordan(); init_rootpqarray(); } -template +template inline -void SNAKokkos::grow_rij(int newnatom, int newnmax) +void SNAKokkos::grow_rij(int newnatom, int newnmax) { if(newnatom <= natom && newnmax <= nmax) return; natom = newnatom; nmax = newnmax; + int natom_div = (natom + vector_length - 1) / vector_length; + + rij = t_sna_3d(Kokkos::NoInit("sna:rij"),natom,nmax,3); + wj = t_sna_2d(Kokkos::NoInit("sna:wj"),natom,nmax); + rcutij = t_sna_2d(Kokkos::NoInit("sna:rcutij"),natom,nmax); inside = t_sna_2i(Kokkos::NoInit("sna:inside"),natom,nmax); - element = t_sna_2i(Kokkos::NoInit("sna:rcutij"),natom,nmax); + element = t_sna_2i(Kokkos::NoInit("sna:element"),natom,nmax); dedr = t_sna_3d(Kokkos::NoInit("sna:dedr"),natom,nmax,3); #ifdef LMP_KOKKOS_GPU if (!host_flag) { - - cayleyklein = t_sna_2ckp(Kokkos::NoInit("sna:cayleyklein"), natom, nmax); + a_pack = t_sna_3c_ll(Kokkos::NoInit("sna:a_pack"),vector_length,nmax,natom_div); + b_pack = t_sna_3c_ll(Kokkos::NoInit("sna:b_pack"),vector_length,nmax,natom_div); + da_pack = t_sna_4c_ll(Kokkos::NoInit("sna:da_pack"),vector_length,nmax,natom_div,3); + db_pack = t_sna_4c_ll(Kokkos::NoInit("sna:db_pack"),vector_length,nmax,natom_div,3); + sfac_pack = t_sna_4d_ll(Kokkos::NoInit("sna:sfac_pack"),vector_length,nmax,natom_div,4); ulisttot = t_sna_3c_ll(Kokkos::NoInit("sna:ulisttot"),1,1,1); // dummy allocation ulisttot_full = t_sna_3c_ll(Kokkos::NoInit("sna:ulisttot"),1,1,1); - ulisttot_re = t_sna_3d_ll(Kokkos::NoInit("sna:ulisttot_re"),idxu_half_max,nelements,natom); - ulisttot_im = t_sna_3d_ll(Kokkos::NoInit("sna:ulisttot_im"),idxu_half_max,nelements,natom); - ulisttot_pack = t_sna_4c_ll(Kokkos::NoInit("sna:ulisttot_pack"),32,idxu_max,nelements,(natom+32-1)/32); + ulisttot_re_pack = t_sna_4d_ll(Kokkos::NoInit("sna:ulisttot_re_pack"),vector_length,idxu_half_max,nelements,natom_div); + ulisttot_im_pack = t_sna_4d_ll(Kokkos::NoInit("sna:ulisttot_im_pack"),vector_length,idxu_half_max,nelements,natom_div); + ulisttot_pack = t_sna_4c_ll(Kokkos::NoInit("sna:ulisttot_pack"),vector_length,idxu_max,nelements,natom_div); ulist = t_sna_3c_ll(Kokkos::NoInit("sna:ulist"),1,1,1); zlist = t_sna_3c_ll(Kokkos::NoInit("sna:zlist"),1,1,1); - zlist_pack = t_sna_4c_ll(Kokkos::NoInit("sna:zlist_pack"),32,idxz_max,ndoubles,(natom+32-1)/32); + zlist_pack = t_sna_4c_ll(Kokkos::NoInit("sna:zlist_pack"),vector_length,idxz_max,ndoubles,natom_div); blist = t_sna_3d_ll(Kokkos::NoInit("sna:blist"),idxb_max,ntriples,natom); - blist_pack = t_sna_4d_ll(Kokkos::NoInit("sna:blist_pack"),32,idxb_max,ntriples,(natom+32-1)/32); - ylist = t_sna_3c_ll(Kokkos::NoInit("sna:ylist"),idxu_half_max,nelements,natom); - ylist_pack_re = t_sna_4d_ll(Kokkos::NoInit("sna:ylist_pack_re"),32,idxu_half_max,nelements,(natom+32-1)/32); - ylist_pack_im = t_sna_4d_ll(Kokkos::NoInit("sna:ylist_pack_im"),32,idxu_half_max,nelements,(natom+32-1)/32); + blist_pack = t_sna_4d_ll(Kokkos::NoInit("sna:blist_pack"),vector_length,idxb_max,ntriples,natom_div); + ylist = t_sna_3c_ll(Kokkos::NoInit("sna:ylist"),1,1,1); + ylist_pack_re = t_sna_4d_ll(Kokkos::NoInit("sna:ylist_pack_re"),vector_length,idxu_half_max,nelements,natom_div); + ylist_pack_im = t_sna_4d_ll(Kokkos::NoInit("sna:ylist_pack_im"),vector_length,idxu_half_max,nelements,natom_div); dulist = t_sna_4c3_ll(Kokkos::NoInit("sna:dulist"),1,1,1); } else { #endif - rij = t_sna_3d(Kokkos::NoInit("sna:rij"),natom,nmax,3); - wj = t_sna_2d(Kokkos::NoInit("sna:wj"),natom,nmax); - rcutij = t_sna_2d(Kokkos::NoInit("sna:rcutij"),natom,nmax); + a_pack = t_sna_3c_ll(Kokkos::NoInit("sna:a_pack"),1,1,1); + b_pack = t_sna_3c_ll(Kokkos::NoInit("sna:b_pack"),1,1,1); + da_pack = t_sna_4c_ll(Kokkos::NoInit("sna:da_pack"),1,1,1,1); + db_pack = t_sna_4c_ll(Kokkos::NoInit("sna:db_pack"),1,1,1,1); + sfac_pack = t_sna_4d_ll(Kokkos::NoInit("sna:sfac_pack"),1,1,1,1); ulisttot = t_sna_3c_ll(Kokkos::NoInit("sna:ulisttot"),idxu_half_max,nelements,natom); ulisttot_full = t_sna_3c_ll(Kokkos::NoInit("sna:ulisttot_full"),idxu_max,nelements,natom); - ulisttot_re = t_sna_3d_ll(Kokkos::NoInit("sna:ulisttot_re"),1,1,1); - ulisttot_im = t_sna_3d_ll(Kokkos::NoInit("sna:ulisttot_im"),1,1,1); + ulisttot_re_pack = t_sna_4d_ll(Kokkos::NoInit("sna:ulisttot_re"),1,1,1,1); + ulisttot_im_pack = t_sna_4d_ll(Kokkos::NoInit("sna:ulisttot_im"),1,1,1,1); ulisttot_pack = t_sna_4c_ll(Kokkos::NoInit("sna:ulisttot_pack"),1,1,1,1); ulist = t_sna_3c_ll(Kokkos::NoInit("sna:ulist"),idxu_cache_max,natom,nmax); zlist = t_sna_3c_ll(Kokkos::NoInit("sna:zlist"),idxz_max,ndoubles,natom); @@ -323,81 +358,85 @@ void SNAKokkos::grow_rij(int newnatom, int newnmax) ComputeFusedDeidrj, which are one warp per atom-neighbor pair. ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_cayley_klein(const int& iatom, const int& jnbor, const double& x, const double& y, - const double& z, const double& rcut, const double& wj_local) +void SNAKokkos::compute_cayley_klein(const int& iatom_mod, const int& jnbor, const int& iatom_div) { - //const double x = rij(iatom,jnbor,0); - //const double y = rij(iatom,jnbor,1); - //const double z = rij(iatom,jnbor,2); - const double rsq = x * x + y * y + z * z; - const double r = sqrt(rsq); - //const double rcut = rcutij(iatom, jnbor); - const double rscale0 = rfac0 * MY_PI / (rcut - rmin0); - const double theta0 = (r - rmin0) * rscale0; - double sn, cs; - sincos(theta0, &sn, &cs); - const double z0 = r * cs / sn; - const double dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; + const int iatom = iatom_mod + vector_length * iatom_div; + const auto x = rij(iatom,jnbor,0); + const auto y = rij(iatom,jnbor,1); + const auto z = rij(iatom,jnbor,2); + const auto rsq = x * x + y * y + z * z; + const auto r = sqrt(rsq); + const auto rcut = rcutij(iatom, jnbor); + const auto rscale0 = rfac0 * static_cast(MY_PI) / (rcut - rmin0); + const auto theta0 = (r - rmin0) * rscale0; + real_type sn, cs; + sincos_wrapper(theta0, &sn, &cs); + const real_type z0 = r * cs / sn; + const real_type dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; - //const double wj_local = wj(iatom, jnbor); - double sfac, dsfac; + const auto wj_local = wj(iatom, jnbor); + real_type sfac, dsfac; compute_s_dsfac(r, rcut, sfac, dsfac); sfac *= wj_local; dsfac *= wj_local; - const double rinv = 1.0 / r; - const double ux = x * rinv; - const double uy = y * rinv; - const double uz = z * rinv; + const auto rinv = static_cast(1.0) / r; + const auto ux = x * rinv; + const auto uy = y * rinv; + const auto uz = z * rinv; - const double r0inv = 1.0 / sqrt(r * r + z0 * z0); + const auto r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); - const SNAcomplex a = { z0 * r0inv, -z * r0inv }; - const SNAcomplex b = { r0inv * y, -r0inv * x }; + const complex a = { z0 * r0inv, -z * r0inv }; + const complex b = { r0inv * y, -r0inv * x }; - const double dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); + const auto dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); - const double dr0invx = dr0invdr * ux; - const double dr0invy = dr0invdr * uy; - const double dr0invz = dr0invdr * uz; + const auto dr0invx = dr0invdr * ux; + const auto dr0invy = dr0invdr * uy; + const auto dr0invz = dr0invdr * uz; - const double dz0x = dz0dr * ux; - const double dz0y = dz0dr * uy; - const double dz0z = dz0dr * uz; + const auto dz0x = dz0dr * ux; + const auto dz0y = dz0dr * uy; + const auto dz0z = dz0dr * uz; - const SNAcomplex dax = { dz0x * r0inv + z0 * dr0invx, -z * dr0invx }; - const SNAcomplex day = { dz0y * r0inv + z0 * dr0invy, -z * dr0invy }; - const SNAcomplex daz = { dz0z * r0inv + z0 * dr0invz, -z * dr0invz - r0inv }; + const complex dax = { dz0x * r0inv + z0 * dr0invx, -z * dr0invx }; + const complex day = { dz0y * r0inv + z0 * dr0invy, -z * dr0invy }; + const complex daz = { dz0z * r0inv + z0 * dr0invz, -z * dr0invz - r0inv }; - const SNAcomplex dbx = { y * dr0invx, -x * dr0invx - r0inv }; - const SNAcomplex dby = { y * dr0invy + r0inv, -x * dr0invy }; - const SNAcomplex dbz = { y * dr0invz, -x * dr0invz }; + const complex dbx = { y * dr0invx, -x * dr0invx - r0inv }; + const complex dby = { y * dr0invy + r0inv, -x * dr0invy }; + const complex dbz = { y * dr0invz, -x * dr0invz }; - const double dsfacux = dsfac * ux; - const double dsfacuy = dsfac * uy; - const double dsfacuz = dsfac * uz; + const auto dsfacux = dsfac * ux; + const auto dsfacuy = dsfac * uy; + const auto dsfacuz = dsfac * uz; - CayleyKleinPack ckp{}; - ckp.a = a; - ckp.b = b; - ckp.da[0] = dax; - ckp.db[0] = dbx; - ckp.da[1] = day; - ckp.db[1] = dby; - ckp.da[2] = daz; - ckp.db[2] = dbz; - ckp.sfac = sfac; - ckp.dsfacu[0] = dsfacux; - ckp.dsfacu[1] = dsfacuy; - ckp.dsfacu[2] = dsfacuz; + a_pack(iatom_mod,jnbor,iatom_div) = a; + b_pack(iatom_mod,jnbor,iatom_div) = b; + + da_pack(iatom_mod,jnbor,iatom_div,0) = dax; + db_pack(iatom_mod,jnbor,iatom_div,0) = dbx; + + da_pack(iatom_mod,jnbor,iatom_div,1) = day; + db_pack(iatom_mod,jnbor,iatom_div,1) = dby; + + da_pack(iatom_mod,jnbor,iatom_div,2) = daz; + db_pack(iatom_mod,jnbor,iatom_div,2) = dbz; + + sfac_pack(iatom_mod,jnbor,iatom_div,0) = sfac; + sfac_pack(iatom_mod,jnbor,iatom_div,1) = dsfacux; + sfac_pack(iatom_mod,jnbor,iatom_div,2) = dsfacuy; + sfac_pack(iatom_mod,jnbor,iatom_div,3) = dsfacuz; + + // we need to explicitly zero `dedr` somewhere before hitting + // ComputeFusedDeidrj --- this is just a convenient place to do it. + dedr(iatom_mod + vector_length * iatom_div, jnbor, 0) = static_cast(0.); + dedr(iatom_mod + vector_length * iatom_div, jnbor, 1) = static_cast(0.); + dedr(iatom_mod + vector_length * iatom_div, jnbor, 2) = static_cast(0.); - // Yes, this breaks the standard mantra of using SoA - // instead of AoS, but it's net fine because of the - // one warp per atom/neighbor pair for the recursive - // polynomials. There's good L1 reuse, anyway. - cayleyklein(iatom, jnbor) = ckp; } /* ---------------------------------------------------------------------- @@ -406,30 +445,28 @@ void SNAKokkos::compute_cayley_klein(const int& iatom, const int& jn advantage of the symmetry of the Wigner U matrices. ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) +void SNAKokkos::pre_ui(const int& iatom_mod, const int& j, const int& ielem, const int& iatom_div) { + for (int jelem = 0; jelem < nelements; jelem++) { - for (int j = 0; j <= twojmax; j++) { - const int jju_half = idxu_half_block(j); + int jju_half = idxu_half_block(j); - // Only diagonal elements get initialized - // Top half only: gets symmetrized by TransformUi - // for (int m = 0; m < (j+1)*(j/2+1); m++) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)), - [&] (const int m) { + // Only diagonal elements get initialized + // Top half only: gets symmetrized by TransformUi - const int jjup = jju_half + m; + for (int mb = 0; 2*mb <= j; mb++) { + for (int ma = 0; ma <= j; ma++) { - // if m is on the "diagonal", initialize it with the self energy. - // Otherwise zero it out - double re_part = 0.; - if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; } + real_type re_part = static_cast(0.); + if (ma == mb && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; } - ulisttot_re(jjup, jelem, iatom) = re_part; - ulisttot_im(jjup, jelem, iatom) = 0.; - }); + ulisttot_re_pack(iatom_mod, jju_half, jelem, iatom_div) = re_part; + ulisttot_im_pack(iatom_mod, jju_half, jelem, iatom_div) = static_cast(0.); + + jju_half++; + } } } @@ -440,9 +477,9 @@ void SNAKokkos::pre_ui(const typename Kokkos::TeamPolicy accumulating to the total. GPU only. ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int iatom, const int jnbor) +void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) { // utot(j,ma,mb) = 0 for all j,ma,ma @@ -452,119 +489,122 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); - // Each warp is accessing the same pack: take advantage of - // L1 reuse - const CayleyKleinPack& ckp = cayleyklein(iatom, jnbor); - const SNAcomplex a = ckp.a; - const SNAcomplex b = ckp.b; - const double sfac = ckp.sfac; + // load parameters + const auto a = a_pack(iatom_mod, jnbor, iatom_div); + const auto b = b_pack(iatom_mod, jnbor, iatom_div); + const auto sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); - const int jelem = element(iatom, jnbor); + const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); - // VMK Section 4.8.2 + // we need to "choose" when to bend + // this for loop is here for context --- we expose additional + // parallelism over this loop instead + //for (int j_bend = 0; j_bend <= twojmax; j_bend++) { - // All writes go to global memory and shared memory - // so we can avoid all global memory reads - Kokkos::single(Kokkos::PerThread(team), [=]() { - buf1[0] = {1.,0.}; - Kokkos::atomic_add(&(ulisttot_re(0,jelem,iatom)), sfac); - }); + // level 0 is just 1. + ulist_wrapper.set(0, complex::one()); - for (int j = 1; j <= twojmax; j++) { + // j from before the bend, don't store, mb == 0 + // this is "creeping up the side" + for (int j = 1; j <= j_bend; j++) { - const int jju = idxu_half_block[j]; + int jjup = idxu_half_block[j-1]; - // fill in left side of matrix layer from previous layer + constexpr int mb = 0; // intentional for readability, compiler should optimize this out - // Flatten loop over ma, mb - // for (int ma = 0; ma <= j; ma++) - const int n_ma = j+1; - // for (int mb = 0; 2*mb <= j; mb++) - const int n_mb = j/2+1; + complex ulist_accum = complex::zero(); - const int total_iters = n_ma * n_mb; + int ma; + for (ma = 0; ma < j; ma++) { + const int jjup_index = idxu_half_block[j - 1] + mb * j + ma; - //for (int m = 0; m < total_iters; m++) { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, total_iters), - [&] (const int m) { + // grab the cached value + const complex ulist_prev = ulist_wrapper.get(ma); - // ma fast, mb slow - // Equivalent to `int ma = m % n_ma; int mb = m / n_ma;` IF everything's positive. - const int mb = m / n_ma; - const int ma = m - mb * n_ma; + // ulist_accum += rootpq * a.conj() * ulist_prev; + real_type rootpq = rootpqarray(j - ma, j - mb); + ulist_accum.re += rootpq * (a.re * ulist_prev.re + a.im * ulist_prev.im); + ulist_accum.im += rootpq * (a.re * ulist_prev.im - a.im * ulist_prev.re); - // index into global memory array - const int jju_index = jju+m; + // store ulist_accum, we atomic accumulate values after the bend, so no atomic add here + ulist_wrapper.set(ma, ulist_accum); - // index into shared memory buffer for this level - const int jju_shared_idx = m; + // next value + // ulist_accum = -rootpq * b.conj() * ulist_prev; + rootpq = rootpqarray(ma + 1, j - mb); + ulist_accum.re = -rootpq * (b.re * ulist_prev.re + b.im * ulist_prev.im); + ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re); - // index into shared memory buffer for next level - const int jjup_shared_idx = jju_shared_idx - mb; - - SNAcomplex u_accum = {0., 0.}; - - // VMK recursion relation: grab contribution which is multiplied by b* - const double rootpq2 = -rootpqarray(ma, j - mb); - const SNAcomplex u_up2 = rootpq2*((ma > 0)?buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.)); - - // u_accum += conj(b) * u_up2 - caconjxpy(b, u_up2, u_accum); - - // VMK recursion relation: grab contribution which is multiplied by a* - const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = rootpq1*((ma < j)?buf1[jjup_shared_idx]:SNAcomplex(0.,0.)); - - // u_accum += conj(a) * u_up1 - caconjxpy(a, u_up1, u_accum); - - // back up into shared memory for next iter - buf2[jju_shared_idx] = u_accum; - - Kokkos::atomic_add(&(ulisttot_re(jju_index,jelem,iatom)), sfac * u_accum.re); - Kokkos::atomic_add(&(ulisttot_im(jju_index,jelem,iatom)), sfac * u_accum.im); - - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) - // if j is even (-> physical j integer), last element maps to self, skip - - const int sign_factor = (((ma+mb)%2==0)?1:-1); - const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); - - if (sign_factor == 1) { - u_accum.im = -u_accum.im; - } else { - u_accum.re = -u_accum.re; } - if (j%2==1 && mb+1==n_mb) { - buf2[jju_shared_flip] = u_accum; + ulist_wrapper.set(ma, ulist_accum); + } + + // now we're after the bend, start storing but only up to the "half way point" + const int j_half_way = MIN(2 * j_bend, twojmax); + + int mb = 1; + int j; //= j_bend + 1; // need this value below + for (j = j_bend + 1; j <= j_half_way; j++) { + + const int jjup = idxu_half_block[j-1] + (mb - 1) * j; + + complex ulist_accum = complex::zero(); + + int ma; + for (ma = 0; ma < j; ma++) { + + // grab the cached value + const complex ulist_prev = ulist_wrapper.get(ma); + + // atomic add the previous level here + Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac); + Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac); + + // ulist_accum += rootpq * b * ulist_prev; + real_type rootpq = rootpqarray(j - ma, mb); + ulist_accum.re += rootpq * (b.re * ulist_prev.re - b.im * ulist_prev.im); + ulist_accum.im += rootpq * (b.re * ulist_prev.im + b.im * ulist_prev.re); + + // store ulist_accum + ulist_wrapper.set(ma, ulist_accum); + + // next value + // ulist_accum = rootpq * a * ulist_prev; + rootpq = rootpqarray(ma + 1, mb); + ulist_accum.re = rootpq * (a.re * ulist_prev.re - a.im * ulist_prev.im); + ulist_accum.im = rootpq * (a.re * ulist_prev.im + a.im * ulist_prev.re); } - // symmetric part of ulisttot is generated in TransformUi + ulist_wrapper.set(ma, ulist_accum); - }); - // In CUDA backend, - // ThreadVectorRange has a __syncwarp (appropriately masked for - // vector lengths < 32) implict at the end + mb++; + } - // swap double buffers - auto tmp = buf1; buf1 = buf2; buf2 = tmp; + // atomic add the last level + const int jjup = idxu_half_block[j-1] + (mb - 1) * j; + + for (int ma = 0; ma < j; ma++) { + const complex ulist_prev = ulist_wrapper.get(ma); + + // atomic add the previous level here + Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac); + Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac); + } + + //} // end of "reference" loop over j_bend - } } @@ -574,9 +614,9 @@ void SNAKokkos::compute_ui(const typename Kokkos::TeamPolicy +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div) +void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div) { const int j1 = idxz(jjz, 0); @@ -589,14 +629,13 @@ void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, con const int na = idxz(jjz, 7); const int nb = idxz(jjz, 8); - const double* cgblock = cglist.data() + idxcg_block(j1, j2, j); + const real_type* cgblock = cglist.data() + idxcg_block(j1, j2, j); int idouble = 0; for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - double ztmp_r = 0.; - double ztmp_i = 0.; + complex ztmp = complex::zero(); int jju1 = idxu_block[j1] + (j1+1)*mb1min; int jju2 = idxu_block[j2] + (j2+1)*mb2max; @@ -615,12 +654,12 @@ void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, con #pragma unroll #endif for(int ia = 0; ia < na; ia++) { - const SNAcomplex utot1 = ulisttot_pack(iatom_mod, jju1+ma1, elem1, iatom_div); - const SNAcomplex utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div); + const auto utot1 = ulisttot_pack(iatom_mod, jju1+ma1, elem1, iatom_div); + const auto utot2 = ulisttot_pack(iatom_mod, jju2+ma2, elem2, iatom_div); const auto cgcoeff_a = cgblock[icga]; const auto cgcoeff_b = cgblock[icgb]; - ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); - ztmp_i += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re); + ztmp.re += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); + ztmp.im += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.im + utot1.im * utot2.re); ma1++; ma2--; icga += j2; @@ -632,11 +671,11 @@ void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, con } // end loop over ib if (bnorm_flag) { - ztmp_r /= (j + 1); - ztmp_i /= (j + 1); + ztmp.re /= (j + 1); + ztmp.im /= (j + 1); } - zlist_pack(iatom_mod,jjz,idouble,iatom_div) = { ztmp_r, ztmp_i }; + zlist_pack(iatom_mod,jjz,idouble,iatom_div) = ztmp; idouble++; } @@ -649,9 +688,9 @@ void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, con divergence. ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) +void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) { // for j1 = 0,...,twojmax // for j2 = 0,twojmax @@ -712,10 +751,10 @@ void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, con const auto utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); const auto zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); - sumzu += 0.5 * (utot.re * zloc.re + utot.im * zloc.im); + sumzu += static_cast(0.5) * (utot.re * zloc.re + utot.im * zloc.im); } // end if jeven - sumzu *= 2.0; + sumzu *= static_cast(2.0); if (bzero_flag) { if (!wselfall_flag) { if (elem1 == elem2 && elem1 == elem3) { @@ -742,12 +781,12 @@ void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, con divergence. GPU version. ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, - const Kokkos::View &beta_pack) +void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, + const Kokkos::View &beta_pack) { - double betaj; + real_type betaj; const int j1 = idxz(jjz, 0); const int j2 = idxz(jjz, 1); @@ -760,15 +799,15 @@ void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, const int nb = idxz(jjz, 8); const int jju_half = idxz(jjz, 9); - const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + const real_type *cgblock = cglist.data() + idxcg_block(j1,j2,j); //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - double ztmp_r = 0.0; - double ztmp_i = 0.0; + real_type ztmp_r = 0.0; + real_type ztmp_i = 0.0; int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; @@ -787,8 +826,8 @@ void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, #pragma unroll #endif for (int ia = 0; ia < na; ia++) { - const SNAcomplex utot1 = ulisttot_pack(iatom_mod,jju1+ma1,elem1,iatom_div); - const SNAcomplex utot2 = ulisttot_pack(iatom_mod,jju2+ma2,elem2,iatom_div); + const auto utot1 = ulisttot_pack(iatom_mod,jju1+ma1,elem1,iatom_div); + const auto utot2 = ulisttot_pack(iatom_mod,jju2+ma2,elem2,iatom_div); const auto cgcoeff_a = cgblock[icga]; const auto cgcoeff_b = cgblock[icgb]; ztmp_r += cgcoeff_a * cgcoeff_b * (utot1.re * utot2.re - utot1.im * utot2.im); @@ -849,169 +888,173 @@ void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, and accumulation into dEidRj. GPU only. ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int iatom, const int jnbor) +void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) { // get shared memory offset - const int max_m_tile = (twojmax+1)*(twojmax/2+1); + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + const int team_rank = team.team_rank(); - const int scratch_shift = team_rank * max_m_tile; - const int jelem = element(iatom, jnbor); + const int scratch_shift = team_rank * tile_size; - // See notes on data layouts for shared memory caching - // in `compute_ui`. + // extract, wrap shared memory buffer + WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); + WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); - // double buffer for ulist - SNAcomplex* ulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; - SNAcomplex* ulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; + // load parameters + const auto a = a_pack(iatom_mod, jnbor, iatom_div); + const auto b = b_pack(iatom_mod, jnbor, iatom_div); + const auto da = da_pack(iatom_mod, jnbor, iatom_div, dir); + const auto db = db_pack(iatom_mod, jnbor, iatom_div, dir); + const auto sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); + const auto dsfacu = sfac_pack(iatom_mod, jnbor, iatom_div, dir + 1); // dsfac * u - // double buffer for dulist - SNAcomplex* dulist_buf1 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; - SNAcomplex* dulist_buf2 = (SNAcomplex*)team.team_shmem( ).get_shmem(team.team_size()*max_m_tile*sizeof(SNAcomplex), 0) + scratch_shift; + const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); - const CayleyKleinPack& ckp = cayleyklein(iatom, jnbor); + auto dedr_full_sum = static_cast(0.); - const SNAcomplex a = ckp.a; - const SNAcomplex b = ckp.b; - const SNAcomplex da = ckp.da[dir]; - const SNAcomplex db = ckp.db[dir]; - const double sfac = ckp.sfac; - const double dsfacu = ckp.dsfacu[dir]; // dsfac * u + // we need to "choose" when to bend + // this for loop is here for context --- we expose additional + // parallelism over this loop instead + //for (int j_bend = 0; j_bend <= twojmax; j_bend++) { - // Accumulate the full contribution to dedr on the fly - const SNAcomplex y_local = ylist(0, jelem, iatom); + // level 0 is just 1, 0 + ulist_wrapper.set(0, complex::one()); + dulist_wrapper.set(0, complex::zero()); - // Symmetry factor of 0.5 b/c 0 element is on diagonal for even j==0 - double dedr_full_sum = 0.5 * dsfacu * y_local.re; + // j from before the bend, don't store, mb == 0 + // this is "creeping up the side" + for (int j = 1; j <= j_bend; j++) { - // single has a warp barrier at the end - Kokkos::single(Kokkos::PerThread(team), [=]() { + int jjup = idxu_half_block[j-1]; - ulist_buf1[0] = {1., 0.}; - dulist_buf1[0] = {0., 0.}; - }); + constexpr int mb = 0; // intentional for readability, compiler should optimize this out - for (int j = 1; j <= twojmax; j++) { - int jju_half = idxu_half_block[j]; + complex ulist_accum = complex::zero(); + complex dulist_accum = complex::zero(); - // flatten the loop over ma,mb + int ma; + for (ma = 0; ma < j; ma++) { + const int jjup_index = idxu_half_block[j - 1] + mb * j + ma; - // for (int ma = 0; ma <= j; ma++) - const int n_ma = j+1; - // for (int mb = 0; 2*mb <= j; mb++) - const int n_mb = j/2+1; + // grab the cached value + const complex ulist_prev = ulist_wrapper.get(ma); + const complex dulist_prev = dulist_wrapper.get(ma); - const int total_iters = n_ma * n_mb; + // ulist_accum += rootpq * a.conj() * ulist_prev; + real_type rootpq = rootpqarray(j - ma, j - mb); + ulist_accum.re += rootpq * (a.re * ulist_prev.re + a.im * ulist_prev.im); + ulist_accum.im += rootpq * (a.re * ulist_prev.im - a.im * ulist_prev.re); - double dedr_sum = 0.; // j-local sum + // product rule of above + dulist_accum.re += rootpq * (da.re * ulist_prev.re + da.im * ulist_prev.im + a.re * dulist_prev.re + a.im * dulist_prev.im); + dulist_accum.im += rootpq * (da.re * ulist_prev.im - da.im * ulist_prev.re + a.re * dulist_prev.im - a.im * dulist_prev.re); - //for (int m = 0; m < total_iters; m++) { - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, total_iters), - [&] (const int m, double& sum_tmp) { + // store ulist_accum, we atomic accumulate values after the bend, so no atomic add here + ulist_wrapper.set(ma, ulist_accum); + dulist_wrapper.set(ma, dulist_accum); - // ma fast, mb slow - const int mb = m / n_ma; - const int ma = m - mb * n_ma; + // next value + // ulist_accum = -rootpq * b.conj() * ulist_prev; + rootpq = rootpqarray(ma + 1, j - mb); + ulist_accum.re = -rootpq * (b.re * ulist_prev.re + b.im * ulist_prev.im); + ulist_accum.im = -rootpq * (b.re * ulist_prev.im - b.im * ulist_prev.re); - const int jju_half_index = jju_half+m; + // product rule of above + dulist_accum.re = -rootpq * (db.re * ulist_prev.re + db.im * ulist_prev.im + b.re * dulist_prev.re + b.im * dulist_prev.im); + dulist_accum.im = -rootpq * (db.re * ulist_prev.im - db.im * ulist_prev.re + b.re * dulist_prev.im - b.im * dulist_prev.re); - // Load y_local, apply the symmetry scaling factor - // The "secret" of the shared memory optimization is it eliminates - // all global memory reads to duidrj in lieu of caching values in - // shared memory and otherwise always writing, making the kernel - // ultimately compute bound. We take advantage of that by adding - // some reads back in. - auto y_local = ylist(jju_half_index, jelem, iatom); - if (j % 2 == 0 && 2*mb == j) { - if (ma == mb) { y_local = 0.5*y_local; } - else if (ma > mb) { y_local = { 0., 0. }; } // can probably avoid this outright + } + + ulist_wrapper.set(ma, ulist_accum); + dulist_wrapper.set(ma, dulist_accum); + } + + // now we're after the bend, start storing but only up to the "half way point" + const int j_half_way = MIN(2 * j_bend, twojmax); + + int mb = 1; + int j; //= j_bend + 1; // need this value below + for (j = j_bend + 1; j <= j_half_way; j++) { + + const int jjup = idxu_half_block[j-1] + (mb - 1) * j; + + complex ulist_accum = complex::zero(); + complex dulist_accum = complex::zero(); + + int ma; + for (ma = 0; ma < j; ma++) { + + // grab y_local early + // this will never be the last element of a row, no need to rescale. + auto y_local = complex(ylist_pack_re(iatom_mod, jjup + ma, jelem, iatom_div), ylist_pack_im(iatom_mod, jjup+ma, jelem, iatom_div)); + + // grab the cached value + const complex ulist_prev = ulist_wrapper.get(ma); + const complex dulist_prev = dulist_wrapper.get(ma); + + // ulist_accum += rootpq * b * ulist_prev; + real_type rootpq = rootpqarray(j - ma, mb); + ulist_accum.re += rootpq * (b.re * ulist_prev.re - b.im * ulist_prev.im); + ulist_accum.im += rootpq * (b.re * ulist_prev.im + b.im * ulist_prev.re); + + // product rule of above + dulist_accum.re += rootpq * (db.re * ulist_prev.re - db.im * ulist_prev.im + b.re * dulist_prev.re - b.im * dulist_prev.im); + dulist_accum.im += rootpq * (db.re * ulist_prev.im + db.im * ulist_prev.re + b.re * dulist_prev.im + b.im * dulist_prev.re); + + // store ulist_accum + ulist_wrapper.set(ma, ulist_accum); + dulist_wrapper.set(ma, dulist_accum); + + // Directly accumulate deidrj into sum_tmp + const complex du_prod = (dsfacu * ulist_prev) + (sfac * dulist_prev); + dedr_full_sum += du_prod.re * y_local.re + du_prod.im * y_local.im; + + // next value + // ulist_accum = rootpq * a * ulist_prev; + rootpq = rootpqarray(ma + 1, mb); + ulist_accum.re = rootpq * (a.re * ulist_prev.re - a.im * ulist_prev.im); + ulist_accum.im = rootpq * (a.re * ulist_prev.im + a.im * ulist_prev.re); + + // product rule of above + dulist_accum.re = rootpq * (da.re * ulist_prev.re - da.im * ulist_prev.im + a.re * dulist_prev.re - a.im * dulist_prev.im); + dulist_accum.im = rootpq * (da.re * ulist_prev.im + da.im * ulist_prev.re + a.re * dulist_prev.im + a.im * dulist_prev.re); + + } + + ulist_wrapper.set(ma, ulist_accum); + dulist_wrapper.set(ma, dulist_accum); + + mb++; + } + + // accumulate the last level + const int jjup = idxu_half_block[j-1] + (mb - 1) * j; + + for (int ma = 0; ma < j; ma++) { + // grab y_local early + auto y_local = complex(ylist_pack_re(iatom_mod, jjup + ma, jelem, iatom_div), ylist_pack_im(iatom_mod, jjup+ma, jelem, iatom_div)); + if (j % 2 == 1 && 2*(mb-1) == j-1) { // double check me... + if (ma == (mb-1)) { y_local = static_cast(0.5)*y_local; } + else if (ma > (mb-1)) { y_local.re = static_cast(0.); y_local.im = static_cast(0.); } // can probably avoid this outright // else the ma < mb gets "double counted", cancelling the 0.5. } - // index into shared memory - const int jju_shared_idx = m; - const int jjup_shared_idx = jju_shared_idx - mb; - - // Need to compute and accumulate both u and du (mayhaps, we could probably - // balance some read and compute by reading u each time). - SNAcomplex u_accum = { 0., 0. }; - SNAcomplex du_accum = { 0., 0. }; - - const double rootpq2 = -rootpqarray(ma, j - mb); - const SNAcomplex u_up2 = rootpq2*((ma > 0) ? ulist_buf1[jjup_shared_idx-1]:SNAcomplex(0.,0.)); - - // u_accum += conj(b) * u_up2 - caconjxpy(b, u_up2, u_accum); - - const double rootpq1 = rootpqarray(j - ma, j - mb); - const SNAcomplex u_up1 = rootpq1*((ma < j) ? ulist_buf1[jjup_shared_idx]:SNAcomplex(0.,0.)); - - // u_accum += conj(a) * u_up1 - caconjxpy(a, u_up1, u_accum); - - // Next, spin up du_accum - const SNAcomplex du_up1 = rootpq1*((ma < j) ? dulist_buf1[jjup_shared_idx] : SNAcomplex(0.,0.)); - - // du_accum += conj(da) * u_up1 + conj(a) * du_up1 - caconjxpy(da, u_up1, du_accum); - caconjxpy(a, du_up1, du_accum); - - const SNAcomplex du_up2 = rootpq2*((ma > 0) ? dulist_buf1[jjup_shared_idx-1] : SNAcomplex(0.,0.)); - - // du_accum += conj(db) * u_up2 + conj(b) * du_up2 - caconjxpy(db, u_up2, du_accum); - caconjxpy(b, du_up2, du_accum); - - // Cache u_accum, du_accum to scratch memory. - ulist_buf2[jju_shared_idx] = u_accum; - dulist_buf2[jju_shared_idx] = du_accum; + const complex ulist_prev = ulist_wrapper.get(ma); + const complex dulist_prev = dulist_wrapper.get(ma); // Directly accumulate deidrj into sum_tmp - const SNAcomplex du_prod = (dsfacu * u_accum) + (sfac * du_accum); - sum_tmp += du_prod.re * y_local.re + du_prod.im * y_local.im; + const complex du_prod = (dsfacu * ulist_prev) + (sfac * dulist_prev); + dedr_full_sum += du_prod.re * y_local.re + du_prod.im * y_local.im; - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) - if (j%2==1 && mb+1==n_mb) { - int sign_factor = (((ma+mb)%2==0)?1:-1); + } + //} // end reference loop over j_bend - const int jju_shared_flip = (j+1-mb)*(j+1)-(ma+1); + // dedr gets zeroed out at the start of each iteration in compute_cayley_klein + Kokkos::atomic_add(&(dedr(iatom_mod + vector_length * iatom_div, jnbor, dir)), static_cast(2.0) * dedr_full_sum); - if (sign_factor == 1) { - u_accum.im = -u_accum.im; - du_accum.im = -du_accum.im; - } else { - u_accum.re = -u_accum.re; - du_accum.re = -du_accum.re; - } - - // We don't need the second half of the tile for the deidrj accumulation. - // That's taken care of by the symmetry factor above. - // We do need it for ortho polynomial generation, though - ulist_buf2[jju_shared_flip] = u_accum; - dulist_buf2[jju_shared_flip] = du_accum; - } - - }, dedr_sum); - - // swap buffers - auto tmp = ulist_buf1; ulist_buf1 = ulist_buf2; ulist_buf2 = tmp; - tmp = dulist_buf1; dulist_buf1 = dulist_buf2; dulist_buf2 = tmp; - - // Accumulate dedr. This "should" be in a single, but - // a Kokkos::single call implies a warp sync, and we may - // as well avoid that. This does no harm as long as the - // final assignment is in a single block. - //Kokkos::single(Kokkos::PerThread(team), [=]() { - dedr_full_sum += dedr_sum; - //}); - } - - // Store the accumulated dedr. - Kokkos::single(Kokkos::PerThread(team), [&] () { - dedr(iatom,jnbor,dir) = dedr_full_sum*2.0; - }); } @@ -1025,9 +1068,9 @@ void SNAKokkos::compute_fused_deidrj(const typename Kokkos::TeamPoli advantage of the symmetry of the Wigner U matrices. * ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) +void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) { for (int jelem = 0; jelem < nelements; jelem++) { for (int j = 0; j <= twojmax; j++) { @@ -1042,8 +1085,8 @@ void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy(0.),static_cast(0.)); + if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init.re = wself; } //need to map iatom to element ulisttot(jjup, jelem, iatom) = init; }); @@ -1059,11 +1102,11 @@ void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) { - double rsq, r, x, y, z, z0, theta0; + real_type rsq, r, x, y, z, z0, theta0; // utot(j,ma,mb) = 0 for all j,ma,ma // utot(j,ma,ma) = 1 for all j,ma @@ -1089,9 +1132,9 @@ void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi_cpu(const int& iter) +void SNAKokkos::compute_zi_cpu(const int& iter) { const int iatom = iter / idxz_max; const int jjz = iter % idxz_max; @@ -1106,22 +1149,22 @@ void SNAKokkos::compute_zi_cpu(const int& iter) const int na = idxz(jjz, 7); const int nb = idxz(jjz, 8); - const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + const real_type *cgblock = cglist.data() + idxcg_block(j1,j2,j); int idouble = 0; for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - zlist(jjz, idouble, iatom).re = 0.0; - zlist(jjz, idouble, iatom).im = 0.0; + zlist(jjz, idouble, iatom).re = static_cast(0.0); + zlist(jjz, idouble, iatom).im = static_cast(0.0); int jju1 = idxu_block[j1] + (j1+1)*mb1min; int jju2 = idxu_block[j2] + (j2+1)*mb2max; int icgb = mb1min*(j2+1) + mb2max; for(int ib = 0; ib < nb; ib++) { - double suma1_r = 0.0; - double suma1_i = 0.0; + real_type suma1_r = static_cast(0.0); + real_type suma1_i = static_cast(0.0); int ma1 = ma1min; int ma2 = ma2max; @@ -1158,9 +1201,9 @@ void SNAKokkos::compute_zi_cpu(const int& iter) compute Bi by summing conj(Ui)*Zi, CPU version ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) +void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) { // for j1 = 0,...,twojmax // for j2 = 0,twojmax @@ -1186,11 +1229,11 @@ void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy(0.0); + real_type sumzu_temp = static_cast(0.0); const int bound = (j+2)/2; Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound), - [&] (const int mbma, double& sum) { + [&] (const int mbma, real_type& sum) { //for(int mb = 0; 2*mb < j; mb++) //for(int ma = 0; ma <= j; ma++) { const int ma = mbma % (j + 1); @@ -1209,7 +1252,7 @@ void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy::compute_bi_cpu(const typename Kokkos::TeamPolicy(0.5)* (ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re + ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im); } // end if jeven Kokkos::single(Kokkos::PerThread(team), [&] () { - sumzu *= 2.0; + sumzu *= static_cast(2.0); // apply bzero shift @@ -1260,12 +1303,12 @@ void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi_cpu(int iter, - const Kokkos::View &beta) +void SNAKokkos::compute_yi_cpu(int iter, + const Kokkos::View &beta) { - double betaj; + real_type betaj; const int iatom = iter / idxz_max; const int jjz = iter % idxz_max; @@ -1280,24 +1323,24 @@ void SNAKokkos::compute_yi_cpu(int iter, const int nb = idxz(jjz, 8); const int jju_half = idxz(jjz, 9); - const double *cgblock = cglist.data() + idxcg_block(j1,j2,j); + const real_type *cgblock = cglist.data() + idxcg_block(j1,j2,j); //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; for (int elem1 = 0; elem1 < nelements; elem1++) { for (int elem2 = 0; elem2 < nelements; elem2++) { - double ztmp_r = 0.0; - double ztmp_i = 0.0; + real_type ztmp_r = 0.0; + real_type ztmp_i = 0.0; int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; - int icgb = mb1min * (j2 +1 ) + mb2max; + int icgb = mb1min * (j2 +1) + mb2max; for (int ib = 0; ib < nb; ib++) { - double suma1_r = 0.0; - double suma1_i = 0.0; + real_type suma1_r = 0.0; + real_type suma1_i = 0.0; int ma1 = ma1min; int ma2 = ma2max; @@ -1368,22 +1411,21 @@ void SNAKokkos::compute_yi_cpu(int iter, data layout ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) { - double rsq, r, x, y, z, z0, theta0, cs, sn; - double dz0dr; + real_type rsq, r, x, y, z, z0, theta0, cs, sn; + real_type dz0dr; x = rij(iatom,jnbor,0); y = rij(iatom,jnbor,1); z = rij(iatom,jnbor,2); rsq = x * x + y * y + z * z; r = sqrt(rsq); - double rscale0 = rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); + auto rscale0 = rfac0 * static_cast(MY_PI) / (rcutij(iatom,jnbor) - rmin0); theta0 = (r - rmin0) * rscale0; - cs = cos(theta0); - sn = sin(theta0); + sincos_wrapper(theta0, &sn, &cs); z0 = r * cs / sn; dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; @@ -1400,16 +1442,16 @@ void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) { - t_scalar3 final_sum; + t_scalar3 final_sum; const int jelem = element(iatom, jnbor); //for(int j = 0; j <= twojmax; j++) { Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j, t_scalar3& sum_tmp) { + [&] (const int& j, t_scalar3& sum_tmp) { int jju_half = idxu_half_block[j]; int jju_cache = idxu_cache_block[j]; @@ -1467,12 +1509,12 @@ void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy of the symmetry of the Wigner U matrices. ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, - double r, double wj, double rcut, int jelem) +void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, + const real_type& r, const real_type& wj, const real_type& rcut, int jelem) { - const double sfac = compute_sfac(r, rcut) * wj; + const auto sfac = compute_sfac(r, rcut) * wj; Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), [&] (const int& j) { @@ -1497,19 +1539,18 @@ void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, - double x, double y, double z, - double z0, double r) +void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, + const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) { - double r0inv; - double a_r, b_r, a_i, b_i; - double rootpq; + real_type r0inv; + real_type a_r, b_r, a_i, b_i; + real_type rootpq; // compute Cayley-Klein parameters for unit quaternion - r0inv = 1.0 / sqrt(r * r + z0 * z0); + r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); a_r = r0inv * z0; a_i = -r0inv * z; b_r = r0inv * y; @@ -1589,23 +1630,23 @@ void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy Uses same cached data layout of ulist ------------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, - double x, double y, double z, - double z0, double r, double dz0dr, - double wj, double rcut) +void SNAKokkos::compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, + const real_type& x, const real_type& y, const real_type& z, + const real_type& z0, const real_type& r, const real_type& dz0dr, + const real_type& wj, const real_type& rcut) { -double r0inv; - double a_r, a_i, b_r, b_i; - double da_r[3], da_i[3], db_r[3], db_i[3]; - double dz0[3], dr0inv[3], dr0invdr; - double rootpq; + real_type r0inv; + real_type a_r, a_i, b_r, b_i; + real_type da_r[3], da_i[3], db_r[3], db_i[3]; + real_type dz0[3], dr0inv[3], dr0invdr; + real_type rootpq; - double rinv = 1.0 / r; - double ux = x * rinv; - double uy = y * rinv; - double uz = z * rinv; + real_type rinv = 1.0 / r; + real_type ux = x * rinv; + real_type uy = y * rinv; + real_type uz = z * rinv; r0inv = 1.0 / sqrt(r * r + z0 * z0); a_r = z0 * r0inv; @@ -1720,8 +1761,8 @@ double r0inv; }); } - double sfac = compute_sfac(r, rcut); - double dsfac = compute_dsfac(r, rcut); + real_type sfac = compute_sfac(r, rcut); + real_type dsfac = compute_dsfac(r, rcut); sfac *= wj; dsfac *= wj; @@ -1755,9 +1796,9 @@ double r0inv; factorial n, wrapper for precomputed table ------------------------------------------------------------------------- */ -template +template inline -double SNAKokkos::factorial(int n) +double SNAKokkos::factorial(int n) { //if (n < 0 || n > nmaxfactorial) { // char str[128]; @@ -1772,8 +1813,8 @@ double SNAKokkos::factorial(int n) factorial n table, size SNA::nmaxfactorial+1 ------------------------------------------------------------------------- */ -template -const double SNAKokkos::nfac_table[] = { +template +const double SNAKokkos::nfac_table[] = { 1, 1, 2, @@ -1948,9 +1989,9 @@ const double SNAKokkos::nfac_table[] = { the function delta given by VMK Eq. 8.2(1) ------------------------------------------------------------------------- */ -template +template inline -double SNAKokkos::deltacg(int j1, int j2, int j) +double SNAKokkos::deltacg(int j1, int j2, int j) { double sfaccg = factorial((j1 + j2 + j) / 2 + 1); return sqrt(factorial((j1 + j2 - j) / 2) * @@ -1963,9 +2004,9 @@ double SNAKokkos::deltacg(int j1, int j2, int j) the quasi-binomial formula VMK 8.2.1(3) ------------------------------------------------------------------------- */ -template +template inline -void SNAKokkos::init_clebsch_gordan() +void SNAKokkos::init_clebsch_gordan() { auto h_cglist = Kokkos::create_mirror_view(cglist); @@ -2033,23 +2074,23 @@ void SNAKokkos::init_clebsch_gordan() the p = 0, q = 0 entries are allocated and skipped for convenience. ------------------------------------------------------------------------- */ -template +template inline -void SNAKokkos::init_rootpqarray() +void SNAKokkos::init_rootpqarray() { auto h_rootpqarray = Kokkos::create_mirror_view(rootpqarray); for (int p = 1; p <= twojmax; p++) for (int q = 1; q <= twojmax; q++) - h_rootpqarray(p,q) = sqrt(static_cast(p)/q); + h_rootpqarray(p,q) = static_cast(sqrt(static_cast(p)/q)); Kokkos::deep_copy(rootpqarray,h_rootpqarray); } /* ---------------------------------------------------------------------- */ -template +template inline -int SNAKokkos::compute_ncoeff() +int SNAKokkos::compute_ncoeff() { int ncount; @@ -2070,88 +2111,72 @@ int SNAKokkos::compute_ncoeff() /* ---------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -double SNAKokkos::compute_sfac(double r, double rcut) +real_type SNAKokkos::compute_sfac(real_type r, real_type rcut) { - if (switch_flag == 0) return 1.0; + constexpr real_type one = static_cast(1.0); + constexpr real_type zero = static_cast(0.0); + constexpr real_type onehalf = static_cast(0.5); + if (switch_flag == 0) return one; if (switch_flag == 1) { - if(r <= rmin0) return 1.0; - else if(r > rcut) return 0.0; + if(r <= rmin0) return one; + else if(r > rcut) return zero; else { - double rcutfac = MY_PI / (rcut - rmin0); - return 0.5 * (cos((r - rmin0) * rcutfac) + 1.0); + auto rcutfac = static_cast(MY_PI) / (rcut - rmin0); + return onehalf * (cos((r - rmin0) * rcutfac) + one); } } - return 0.0; + return zero; } /* ---------------------------------------------------------------------- */ -template +template KOKKOS_INLINE_FUNCTION -double SNAKokkos::compute_dsfac(double r, double rcut) +real_type SNAKokkos::compute_dsfac(real_type r, real_type rcut) { - if (switch_flag == 0) return 0.0; + constexpr real_type zero = static_cast(0.0); + constexpr real_type onehalf = static_cast(0.5); + if (switch_flag == 0) return zero; if (switch_flag == 1) { - if(r <= rmin0) return 0.0; - else if(r > rcut) return 0.0; + if(r <= rmin0) return zero; + else if(r > rcut) return zero; else { - double rcutfac = MY_PI / (rcut - rmin0); - return -0.5 * sin((r - rmin0) * rcutfac) * rcutfac; + auto rcutfac = static_cast(MY_PI) / (rcut - rmin0); + return -onehalf * sin((r - rmin0) * rcutfac) * rcutfac; } } - return 0.0; + return zero; } -template +template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_s_dsfac(const double r, const double rcut, double& sfac, double& dsfac) { - if (switch_flag == 0) { sfac = 0.; dsfac = 0.; } +void SNAKokkos::compute_s_dsfac(const real_type r, const real_type rcut, real_type& sfac, real_type& dsfac) { + constexpr real_type one = static_cast(1.0); + constexpr real_type zero = static_cast(0.0); + constexpr real_type onehalf = static_cast(0.5); + if (switch_flag == 0) { sfac = zero; dsfac = zero; } else if (switch_flag == 1) { - if (r <= rmin0) { sfac = 1.0; dsfac = 0.0; } - else if (r > rcut) { sfac = 0.; dsfac = 0.; } + if (r <= rmin0) { sfac = one; dsfac = zero; } + else if (r > rcut) { sfac = zero; dsfac = zero; } else { - const double rcutfac = MY_PI / (rcut - rmin0); - double sn, cs; - sincos((r - rmin0) * rcutfac, &sn, &cs); - sfac = 0.5 * (cs + 1.0); - dsfac = -0.5 * sn * rcutfac; + const auto rcutfac = static_cast(MY_PI) / (rcut - rmin0); + real_type sn, cs; + sincos_wrapper((r - rmin0) * rcutfac, &sn, &cs); // need to create a wrapper + sfac = onehalf * (cs + one); + dsfac = -onehalf * sn * rcutfac; } - } else { sfac = 0.; dsfac = 0.; } -} - -/* ---------------------------------------------------------------------- */ - -// efficient complex FMA (i.e., y += a x) -template -KOKKOS_FORCEINLINE_FUNCTION -void SNAKokkos::caxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) { - y.re += a.re * x.re; - y.re -= a.im * x.im; - y.im += a.im * x.re; - y.im += a.re * x.im; -} - -/* ---------------------------------------------------------------------- */ - -// efficient complex FMA, conjugate of scalar (i.e.) y += (a.re - i a.im) x) -template -KOKKOS_FORCEINLINE_FUNCTION -void SNAKokkos::caconjxpy(const SNAcomplex& a, const SNAcomplex& x, SNAcomplex& y) { - y.re += a.re * x.re; - y.re += a.im * x.im; - y.im -= a.im * x.re; - y.im += a.re * x.im; + } else { sfac = zero; dsfac = zero; } } /* ---------------------------------------------------------------------- */ // set direction of batched Duidrj -template +template KOKKOS_FORCEINLINE_FUNCTION -void SNAKokkos::set_dir(int dir_) { +void SNAKokkos::set_dir(int dir_) { dir = dir_; } @@ -2159,8 +2184,8 @@ void SNAKokkos::set_dir(int dir_) { memory usage of arrays ------------------------------------------------------------------------- */ -template -double SNAKokkos::memory_usage() +template +double SNAKokkos::memory_usage() { int jdimpq = twojmax + 2; int jdim = twojmax + 1; @@ -2168,45 +2193,53 @@ double SNAKokkos::memory_usage() bytes = 0; - bytes += jdimpq*jdimpq * sizeof(double); // pqarray - bytes += idxcg_max * sizeof(double); // cglist + bytes += jdimpq*jdimpq * sizeof(real_type); // pqarray + bytes += idxcg_max * sizeof(real_type); // cglist #ifdef LMP_KOKKOS_GPU if (!host_flag) { - auto natom_pad = (natom+32-1)/32; + auto natom_pad = (natom+vector_length-1)/vector_length; - bytes += natom * idxu_half_max * nelements * sizeof(double); // ulisttot_re - bytes += natom * idxu_half_max * nelements * sizeof(double); // ulisttot_im - bytes += natom_pad * idxu_max * nelements * sizeof(double) * 2; // ulisttot_pack + bytes += natom_pad * nmax * sizeof(real_type) * 2; // a_pack + bytes += natom_pad * nmax * sizeof(real_type) * 2; // b_pack + bytes += natom_pad * nmax * 3 * sizeof(real_type) * 2; // da_pack + bytes += natom_pad * nmax * 3 * sizeof(real_type) * 2; // db_pack + bytes += natom_pad * nmax * 4 * sizeof(real_type); // sfac_pack - bytes += natom_pad * idxz_max * ndoubles * sizeof(double) * 2; // zlist_pack - bytes += natom_pad * idxb_max * ntriples * sizeof(double); // blist_pack - bytes += natom_pad * idxu_half_max * nelements * sizeof(double); // ylist_pack_re - bytes += natom_pad * idxu_half_max * nelements * sizeof(double); // ylist_pack_im - bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ylist + bytes += natom_pad * idxu_half_max * nelements * sizeof(real_type); // ulisttot_re_pack + bytes += natom_pad * idxu_half_max * nelements * sizeof(real_type); // ulisttot_im_pack + bytes += natom_pad * idxu_max * nelements * sizeof(real_type) * 2; // ulisttot_pack + + bytes += natom_pad * idxz_max * ndoubles * sizeof(real_type) * 2; // zlist_pack + bytes += natom_pad * idxb_max * ntriples * sizeof(real_type); // blist_pack + + bytes += natom_pad * idxu_half_max * nelements * sizeof(real_type); // ylist_pack_re + bytes += natom_pad * idxu_half_max * nelements * sizeof(real_type); // ylist_pack_im } else { #endif - bytes += natom * nmax * idxu_cache_max * sizeof(double) * 2; // ulist - bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ulisttot + bytes += natom * nmax * idxu_cache_max * sizeof(real_type) * 2; // ulist + bytes += natom * idxu_half_max * nelements * sizeof(real_type) * 2; // ulisttot + bytes += natom * idxu_max * nelements * sizeof(real_type) * 2; // ulisttot_full - bytes += natom * idxz_max * ndoubles * sizeof(double) * 2; // zlist - bytes += natom * idxb_max * ntriples * sizeof(double); // blist + bytes += natom * idxz_max * ndoubles * sizeof(real_type) * 2; // zlist + bytes += natom * idxb_max * ntriples * sizeof(real_type); // blist - bytes += natom * idxu_half_max * nelements * sizeof(double) * 2; // ylist + bytes += natom * idxu_half_max * nelements * sizeof(real_type) * 2; // ylist - bytes += natom * nmax * idxu_cache_max * 3 * sizeof(double) * 2; // dulist + bytes += natom * nmax * idxu_cache_max * 3 * sizeof(real_type) * 2; // dulist #ifdef LMP_KOKKOS_GPU } #endif - bytes += natom * nmax * 3 * sizeof(double); // dedr + bytes += natom * nmax * 3 * sizeof(real_type); // dedr bytes += jdim * jdim * jdim * sizeof(int); // idxcg_block bytes += jdim * sizeof(int); // idxu_block bytes += jdim * sizeof(int); // idxu_half_block + bytes += idxu_max * sizeof(FullHalfMapper); // idxu_full_half bytes += jdim * sizeof(int); // idxu_cache_block bytes += jdim * jdim * jdim * sizeof(int); // idxz_block bytes += jdim * jdim * jdim * sizeof(int); // idxb_block @@ -2214,12 +2247,12 @@ double SNAKokkos::memory_usage() bytes += idxz_max * 10 * sizeof(int); // idxz bytes += idxb_max * 3 * sizeof(int); // idxb - bytes += jdim * sizeof(double); // bzero + bytes += jdim * sizeof(real_type); // bzero - bytes += natom * nmax * 3 * sizeof(double); // rij - bytes += natom * nmax * sizeof(int); // inside - bytes += natom * nmax * sizeof(double); // wj - bytes += natom * nmax * sizeof(double); // rcutij + bytes += natom * nmax * 3 * sizeof(real_type); // rij + bytes += natom * nmax * sizeof(real_type); // inside + bytes += natom * nmax * sizeof(real_type); // wj + bytes += natom * nmax * sizeof(real_type); // rcutij return bytes; } diff --git a/src/SNAP/pair_snap.cpp b/src/SNAP/pair_snap.cpp index a7ea6de591..741c237ca8 100644 --- a/src/SNAP/pair_snap.cpp +++ b/src/SNAP/pair_snap.cpp @@ -657,7 +657,7 @@ void PairSNAP::read_files(char *coefffilename, char *paramfilename) chemflag = 0; bnormflag = 0; wselfallflag = 0; - chunksize = 2000; + chunksize = 4096; // open SNAP parameter file on proc 0