diff --git a/src/KOKKOS/pair_snap_kokkos.h b/src/KOKKOS/pair_snap_kokkos.h index ca1884bfd2..660503eed8 100644 --- a/src/KOKKOS/pair_snap_kokkos.h +++ b/src/KOKKOS/pair_snap_kokkos.h @@ -30,29 +30,34 @@ PairStyle(snap/kk/host,PairSNAPKokkosDevice); #include "pair_snap.h" #include "kokkos_type.h" #include "neigh_list_kokkos.h" -#include "sna_kokkos.h" #include "pair_kokkos.h" +namespace LAMMPS_NS { +// pre-declare so sna_kokkos.h can refer to it +template class PairSNAPKokkos; +}; + +#include "sna_kokkos.h" + namespace LAMMPS_NS { // Routines for both the CPU and GPU backend +struct TagPairSNAPPreUi{}; +struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist +template struct TagPairSNAPComputeZi{}; +template struct TagPairSNAPComputeBi{}; +struct TagPairSNAPComputeBetaLinear{}; +struct TagPairSNAPComputeBetaQuadratic{}; +template struct TagPairSNAPComputeYi{}; +template struct TagPairSNAPComputeYiWithZlist{}; template struct TagPairSNAPComputeForce{}; - // GPU backend only struct TagPairSNAPComputeNeigh{}; struct TagPairSNAPComputeCayleyKlein{}; -struct TagPairSNAPPreUi{}; struct TagPairSNAPComputeUiSmall{}; // more parallelism, more divergence struct TagPairSNAPComputeUiLarge{}; // less parallelism, no divergence -struct TagPairSNAPTransformUi{}; // re-order ulisttot from SoA to AoSoA, zero ylist -struct TagPairSNAPComputeZi{}; -struct TagPairSNAPBeta{}; -struct TagPairSNAPComputeBi{}; -struct TagPairSNAPTransformBi{}; // re-order blist from AoSoA to AoS -struct TagPairSNAPComputeYi{}; -struct TagPairSNAPComputeYiWithZlist{}; template struct TagPairSNAPComputeFusedDeidrjSmall{}; // more parallelism, more divergence template @@ -60,14 +65,7 @@ struct TagPairSNAPComputeFusedDeidrjLarge{}; // less parallelism, no divergence // CPU backend only struct TagPairSNAPComputeNeighCPU{}; -struct TagPairSNAPPreUiCPU{}; struct TagPairSNAPComputeUiCPU{}; -struct TagPairSNAPTransformUiCPU{}; -struct TagPairSNAPComputeZiCPU{}; -struct TagPairSNAPBetaCPU{}; -struct TagPairSNAPComputeBiCPU{}; -struct TagPairSNAPZeroYiCPU{}; -struct TagPairSNAPComputeYiCPU{}; struct TagPairSNAPComputeDuidrjCPU{}; struct TagPairSNAPComputeDeidrjCPU{}; @@ -80,6 +78,8 @@ class PairSNAPKokkos : public PairSNAP { typedef ArrayTypes AT; typedef EV_FLOAT value_type; + static constexpr LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; + static constexpr int host_flag = (execution_space == LAMMPS_NS::Host); static constexpr int vector_length = vector_length_; using real_type = real_type_; using complex = SNAComplex; @@ -93,9 +93,11 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int team_size_compute_ui = 2; static constexpr int tile_size_transform_ui = 2; static constexpr int tile_size_compute_zi = 2; + static constexpr int min_blocks_compute_zi = 0; // no minimum bound static constexpr int tile_size_compute_bi = 2; - static constexpr int tile_size_transform_bi = 2; + static constexpr int tile_size_compute_beta = 2; static constexpr int tile_size_compute_yi = 2; + static constexpr int min_blocks_compute_yi = 0; // no minimum bound static constexpr int team_size_compute_fused_deidrj = 2; #elif defined(KOKKOS_ENABLE_SYCL) static constexpr int team_size_compute_neigh = 4; @@ -104,9 +106,11 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int team_size_compute_ui = 8; static constexpr int tile_size_transform_ui = 8; static constexpr int tile_size_compute_zi = 4; + static constexpr int min_blocks_compute_zi = 0; // no minimum bound static constexpr int tile_size_compute_bi = 4; - static constexpr int tile_size_transform_bi = 4; + static constexpr int tile_size_compute_beta = 8; static constexpr int tile_size_compute_yi = 8; + static constexpr int min_blocks_compute_yi = 0; // no minimum bound static constexpr int team_size_compute_fused_deidrj = 4; #else static constexpr int team_size_compute_neigh = 4; @@ -116,17 +120,21 @@ class PairSNAPKokkos : public PairSNAP { static constexpr int tile_size_transform_ui = 4; static constexpr int tile_size_compute_zi = 8; static constexpr int tile_size_compute_bi = 4; - static constexpr int tile_size_transform_bi = 4; + static constexpr int tile_size_compute_beta = 4; static constexpr int tile_size_compute_yi = 8; static constexpr int team_size_compute_fused_deidrj = sizeof(real_type) == 4 ? 4 : 2; + + // this empirically reduces perf fluctuations from compiler version to compiler version + static constexpr int min_blocks_compute_zi = 4; + static constexpr int min_blocks_compute_yi = 4; #endif // Custom MDRangePolicy, Rank3, to reduce verbosity of kernel launches // This hides the Kokkos::IndexType and Kokkos::Rank<3...> // and reduces the verbosity of the LaunchBound by hiding the explicit // multiplication by vector_length - template - using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagPairSNAP>; + template + using Snap3DRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::Rank<3, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, Kokkos::LaunchBounds, TagPairSNAP>; // Custom SnapAoSoATeamPolicy to reduce the verbosity of kernel launches // This hides the LaunchBounds abstraction by hiding the explicit @@ -134,6 +142,29 @@ class PairSNAPKokkos : public PairSNAP { template using SnapAoSoATeamPolicy = typename Kokkos::TeamPolicy, TagPairSNAP>; + // Custom MDRangePolicy, Rank2, on the host, to reduce verbosity of kernel launches. The striding of this launch is intentionally + // different from the tiled 3D range policy on the device. + template + using Snap2DHostRangePolicy = typename Kokkos::MDRangePolicy, Kokkos::IndexType, Kokkos::Rank<2, Kokkos::Iterate::Right, Kokkos::Iterate::Right>, TagPairSNAP>; + + // Custom RangePolicy, Rank2, on the host, to reduce verbosity of kernel launches + template + using Snap1DHostRangePolicy = typename Kokkos::RangePolicy, TagPairSNAP>; + + // Helper routine that returns a CPU or a GPU policy as appropriate + template + auto snap_get_policy(const int& chunk_size_div, const int& second_loop) { + if constexpr (host_flag) { + return Snap1DHostRangePolicy(0, chunk_size_div * vector_length); + + // the 2-d policy is still correct but it has atomics so it's slower on the CPU + //return Snap2DHostRangePolicy({0, 0}, {chunk_size_div * vector_length, second_loop}); + } else + return Snap3DRangePolicy({0, 0, 0}, + {vector_length, second_loop, chunk_size_div}, + {vector_length, num_tiles, 1}); + } + PairSNAPKokkos(class LAMMPS *); ~PairSNAPKokkos() override; @@ -149,6 +180,7 @@ class PairSNAPKokkos : public PairSNAP { template void check_team_size_reduce(int, int&); + // CPU and GPU backend template KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeForce,const int& ii) const; @@ -157,18 +189,23 @@ class PairSNAPKokkos : public PairSNAP { KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeForce,const int& ii, EV_FLOAT&) const; - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPBetaCPU,const int& ii) const; - // GPU backend only KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + // GPU backend only KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeCayleyKlein, const int iatom_mod, const int jnbor, const int iatom_div) const; + // CPU and GPU KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPPreUi,const int iatom_mod, const int j, const int iatom_div) const; + void operator() (TagPairSNAPPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPPreUi, const int& iatom, const int& j) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPPreUi, const int& iatom) const; KOKKOS_INLINE_FUNCTION void operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const; @@ -177,25 +214,67 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPTransformUi,const int iatom_mod, const int j, const int iatom_div) const; + void operator() (TagPairSNAPTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZi,const int iatom_mod, const int idxz, const int iatom_div) const; + void operator() (TagPairSNAPTransformUi, const int& iatom, const int& idxu) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPBeta, const int& ii) const; + void operator() (TagPairSNAPTransformUi, const int& iatom) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeZi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeZi, const int& iatom, const int& idxz) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeZi, const int& iatom) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBi, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBi, const int& iatom, const int& idxb) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBi, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBi,const int iatom_mod, const int idxb, const int iatom_div) const; + void operator() (TagPairSNAPComputeBetaLinear, const int& iatom_mod, const int& idxb, const int& iatom_div) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const; + void operator() (TagPairSNAPComputeBetaLinear, const int& iatom, const int& idxb) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYi,const int iatom_mod, const int idxz, const int iatom_div) const; + void operator() (TagPairSNAPComputeBetaLinear, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int idxz, const int iatom_div) const; + void operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom_mod, const int& idxb, const int& iatom_div) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom, const int& idxb) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi, const int& iatom, const int& idxz) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYi, const int& iatom) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom_mod, const int& idxz, const int& iatom_div) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom, const int& idxz) const; + + template KOKKOS_INLINE_FUNCTION + void operator() (TagPairSNAPComputeYiWithZlist, const int& iatom) const; template KOKKOS_INLINE_FUNCTION @@ -210,28 +289,22 @@ class PairSNAPKokkos : public PairSNAP { void operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeUiCPU, const int& iatom, const int& jnbor) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeUiCPU, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const; + void operator() (TagPairSNAPComputeDuidrjCPU, const int& iatom, const int& jnbor) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeZiCPU,const int& ii) const; + void operator() (TagPairSNAPComputeDuidrjCPU, const int& iatom) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeDeidrjCPU, const int& iatom, const int& jnbor) const; KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeYiCPU,const int& ii) const; - - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; - - KOKKOS_INLINE_FUNCTION - void operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const; + void operator() (TagPairSNAPComputeDeidrjCPU, const int& iatom) const; template KOKKOS_INLINE_FUNCTION @@ -252,7 +325,7 @@ class PairSNAPKokkos : public PairSNAP { SNAKokkos snaKK; int inum,max_neighs,chunk_size,chunk_offset; - int host_flag,neighflag; + int neighflag; int eflag,vflag; @@ -260,13 +333,12 @@ class PairSNAPKokkos : public PairSNAP { Kokkos::View d_radelem; // element radii Kokkos::View d_wjelem; // elements weights - Kokkos::View d_coeffelem; // element bispectrum coefficients + typename SNAKokkos::t_sna_2d_lr d_coeffelem; // element bispectrum coefficients Kokkos::View d_sinnerelem; // element inner cutoff midpoint Kokkos::View d_dinnerelem; // element inner cutoff half-width Kokkos::View d_map; // mapping from atom types to elements Kokkos::View d_ninside; // ninside for all atoms in list - Kokkos::View d_beta; // betas for all atoms in list - Kokkos::View d_beta_pack; // betas for all atoms in list, GPU + typename SNAKokkos::t_sna_2d d_beta; // betas for all atoms in list typedef Kokkos::DualView tdual_fparams; tdual_fparams k_cutsq; @@ -301,6 +373,9 @@ class PairSNAPKokkos : public PairSNAP { template int scratch_size_helper(int values_per_team); + // Make SNAKokkos a friend + friend class SNAKokkos; + }; diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 97c7d17ea9..2b9b862645 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -59,11 +59,8 @@ PairSNAPKokkos::PairSNAPKokkos(LAMMPS *lmp kokkosable = 1; atomKK = (AtomKokkos *) atom; - execution_space = ExecutionSpaceFromDevice::space; datamask_read = EMPTY_MASK; datamask_modify = EMPTY_MASK; - - host_flag = (execution_space == Host); } /* ---------------------------------------------------------------------- */ @@ -85,15 +82,6 @@ PairSNAPKokkos::~PairSNAPKokkos() template void PairSNAPKokkos::init_style() { - if (host_flag) { - if (lmp->kokkos->nthreads > 1) - error->all(FLERR,"Pair style snap/kk can currently only run on a single " - "CPU thread"); - - PairSNAP::init_style(); - return; - } - if (force->newton_pair == 0) error->all(FLERR,"Pair style SNAP requires newton pair on"); @@ -134,13 +122,6 @@ struct FindMaxNumNeighs { template void PairSNAPKokkos::compute(int eflag_in, int vflag_in) { - if (host_flag) { - atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK); - PairSNAP::compute(eflag_in,vflag_in); - atomKK->modified(Host,F_MASK); - return; - } - eflag = eflag_in; vflag = vflag_in; @@ -197,21 +178,23 @@ void PairSNAPKokkos::compute(int eflag_in, Kokkos::parallel_reduce("PairSNAPKokkos::find_max_neighs",inum, FindMaxNumNeighs(k_list), Kokkos::Max(max_neighs)); int team_size_default = 1; - if (!host_flag) + if constexpr (!host_flag) team_size_default = 32;//max_neighs; if (beta_max < inum) { beta_max = inum; - MemKK::realloc_kokkos(d_beta,"PairSNAPKokkos:beta",ncoeff,inum); - if (!host_flag) - MemKK::realloc_kokkos(d_beta_pack,"PairSNAPKokkos:beta_pack",vector_length,ncoeff,(inum + vector_length - 1) / vector_length); - MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside",inum); + // padded allocation, similar to within grow_rij + const int inum_div = (inum + vector_length - 1) / vector_length; + const int inum_pad = inum_div * vector_length; + MemKK::realloc_kokkos(d_beta,"PairSNAPKokkos:beta", inum_pad, ncoeff); + snaKK.d_beta = d_beta; + MemKK::realloc_kokkos(d_ninside,"PairSNAPKokkos:ninside", inum_pad); } - chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user + chunk_size = MIN(chunksize, inum); // "chunksize" variable is set by user chunk_offset = 0; - snaKK.grow_rij(chunk_size,max_neighs); + snaKK.grow_rij(chunk_size, max_neighs); EV_FLOAT ev; @@ -222,249 +205,195 @@ void PairSNAPKokkos::compute(int eflag_in, if (chunk_size > inum - chunk_offset) chunk_size = inum - chunk_offset; - if (host_flag) + // pre-compute ceil(chunk_size / vector_length) and the padded chunk size for convenience + const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; + //const int chunk_size_pad = chunk_size_div * vector_length; + + // ComputeNeigh + if constexpr (host_flag) { + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size); + typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); + Kokkos::parallel_for("ComputeNeighCPU",policy_neigh,*this); + } else { + // team_size_compute_neigh is defined in `pair_snap_kokkos.h` + int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); + + SnapAoSoATeamPolicy policy_neigh(chunk_size,team_size_compute_neigh,vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + } + + // ComputeCayleyKlein; this is only called on the GPU + if constexpr (!host_flag) { + // tile_size_compute_ck is defined in `pair_snap_kokkos.h` + Snap3DRangePolicy + policy_compute_ck({0,0,0},{vector_length,max_neighs,chunk_size_div},{vector_length,tile_size_compute_ck,1}); + Kokkos::parallel_for("ComputeCayleyKlein", policy_compute_ck, *this); + } + + // PreUi; same CPU and GPU codepath { - // Host codepath - - //ComputeNeigh - { - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size); - typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); - Kokkos::parallel_for("ComputeNeighCPU",policy_neigh,*this); - } - - //PreUi - { - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size); - typename Kokkos::TeamPolicy policy_preui_cpu((chunk_size+team_size-1)/team_size,team_size,vector_length); - Kokkos::parallel_for("PreUiCPU",policy_preui_cpu,*this); - } - - // ComputeUi - { - int team_size = team_size_default; - // Fused calculation of ulist and accumulation into ulisttot using atomics - typename Kokkos::TeamPolicy policy_ui_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - Kokkos::parallel_for("ComputeUiCPU",policy_ui_cpu,*this); - } - - { - // Expand ulisttot -> ulisttot_full - // Zero out ylist - typename Kokkos::MDRangePolicy, Kokkos::Rank<2, Kokkos::Iterate::Left, Kokkos::Iterate::Left>, TagPairSNAPTransformUiCPU> policy_transform_ui_cpu({0,0},{twojmax+1,chunk_size}); - Kokkos::parallel_for("TransformUiCPU",policy_transform_ui_cpu,*this); - } - - //Compute bispectrum - if (quadraticflag || eflag) { - //ComputeZi - int idxz_max = snaKK.idxz_max; - typename Kokkos::RangePolicy policy_zi_cpu(0,chunk_size*idxz_max); - Kokkos::parallel_for("ComputeZiCPU",policy_zi_cpu,*this); - - //ComputeBi - int team_size = team_size_default; - check_team_size_for(chunk_size,team_size); - typename Kokkos::TeamPolicy policy_bi_cpu(chunk_size,team_size,vector_length); - Kokkos::parallel_for("ComputeBiCPU",policy_bi_cpu,*this); - } - - //ComputeYi - { - //Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0,chunk_size); - Kokkos::parallel_for("ComputeBetaCPU",policy_beta,*this); - - //ComputeYi - int idxz_max = snaKK.idxz_max; - typename Kokkos::RangePolicy policy_yi_cpu(0,chunk_size*idxz_max); - Kokkos::parallel_for("ComputeYiCPU",policy_yi_cpu,*this); - } // host flag - - //ComputeDuidrj and Deidrj - { - int team_size = team_size_default; - - typename Kokkos::TeamPolicy policy_duidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - Kokkos::parallel_for("ComputeDuidrjCPU",policy_duidrj_cpu,*this); - - typename Kokkos::TeamPolicy policy_deidrj_cpu(((chunk_size+team_size-1)/team_size)*max_neighs,team_size,vector_length); - - Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); - } - - } else { // GPU - -#ifdef LMP_KOKKOS_GPU - - // Pre-compute ceil(chunk_size / vector_length) for code cleanliness - const int chunk_size_div = (chunk_size + vector_length - 1) / vector_length; - - //ComputeNeigh - { - // team_size_compute_neigh is defined in `pair_snap_kokkos.h` - int scratch_size = scratch_size_helper(team_size_compute_neigh * max_neighs); - - SnapAoSoATeamPolicy policy_neigh(chunk_size,team_size_compute_neigh,vector_length); - policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); - } - - //ComputeCayleyKlein - { - // tile_size_compute_ck is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy - policy_compute_ck({0,0,0},{vector_length,max_neighs,chunk_size_div},{vector_length,tile_size_compute_ck,1}); - Kokkos::parallel_for("ComputeCayleyKlein",policy_compute_ck,*this); - } - - //PreUi - { - // tile_size_pre_ui is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy - policy_preui({0,0,0},{vector_length,twojmax+1,chunk_size_div},{vector_length,tile_size_pre_ui,1}); - Kokkos::parallel_for("PreUi",policy_preui,*this); - } + auto policy_pre_ui = snap_get_policy(chunk_size_div, twojmax + 1); + Kokkos::parallel_for("PreUi", policy_pre_ui, *this); + } + // ComputeUi; separate CPU, GPU codepaths + if constexpr (host_flag) { + // Fused calculation of ulist and accumulation into ulisttot using atomics + auto policy_ui_cpu = snap_get_policy(chunk_size_div, max_neighs); + Kokkos::parallel_for("ComputeUiCPU", policy_ui_cpu, *this); + } else { // ComputeUi w/vector parallelism, shared memory, direct atomicAdd into ulisttot + + // team_size_compute_ui is defined in `pair_snap_kokkos.h` + // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer + const int tile_size = vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); + + if (chunk_size < parallel_thresh) { - // team_size_compute_ui is defined in `pair_snap_kokkos.h` - // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer - const int tile_size = vector_length * (twojmax + 1); - const int scratch_size = scratch_size_helper(team_size_compute_ui * tile_size); + // Version with parallelism over j_bend - if (chunk_size < parallel_thresh) - { - // Version with parallelism over j_bend + // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations) + const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations) - const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); - const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this); + } else { + // Version w/out parallelism over j_bend - SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); - policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeUiSmall",policy_ui,*this); - } else { - // Version w/out parallelism over j_bend + // total number of teams needed: (natoms / 32) * (max_neighs) + const int n_teams = chunk_size_div * max_neighs; + const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; - // total number of teams needed: (natoms / 32) * (max_neighs) - const int n_teams = chunk_size_div * max_neighs; - const int n_teams_div = (n_teams + team_size_compute_ui - 1) / team_size_compute_ui; + SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); + policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this); + } + } - SnapAoSoATeamPolicy policy_ui(n_teams_div, team_size_compute_ui, vector_length); - policy_ui = policy_ui.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeUiLarge",policy_ui,*this); - } + { + // Expand ulisttot_re,_im -> ulisttot + // Zero out ylist + auto policy_transform_ui = snap_get_policy(chunk_size_div, snaKK.idxu_max); + Kokkos::parallel_for("TransformUi", policy_transform_ui, *this); + } + + //Compute bispectrum + if (quadraticflag || eflag) { + // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h` + + //ComputeZi and Bi + if (nelements > 1) { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZiChemsnap", policy_compute_zi, *this); + + auto policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBiChemsnap", policy_compute_bi, *this); + } else { + auto policy_compute_zi = snap_get_policy, min_blocks_compute_zi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeZi", policy_compute_zi, *this); + + auto policy_compute_bi = snap_get_policy>(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBi", policy_compute_bi, *this); } - //TransformUi: un-"fold" ulisttot, zero ylist - { - // team_size_transform_ui is defined in `pair_snap_kokkos.h` - Snap3DRangePolicy - policy_transform_ui({0,0,0},{vector_length,snaKK.idxu_max,chunk_size_div},{vector_length,tile_size_transform_ui,1}); - Kokkos::parallel_for("TransformUi",policy_transform_ui,*this); - } + } - //Compute bispectrum in AoSoA data layout, transform Bi - if (quadraticflag || eflag) { - // team_size_[compute_zi, compute_bi, transform_bi] are defined in `pair_snap_kokkos.h` + { + //Compute beta = dE_i/dB_i for all i in list; linear portion only + auto policy_compute_beta_linear = snap_get_policy(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBetaLinear", policy_compute_beta_linear, *this); - //ComputeZi - const int idxz_max = snaKK.idxz_max; - Snap3DRangePolicy - policy_compute_zi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_zi,1}); - Kokkos::parallel_for("ComputeZi",policy_compute_zi,*this); - - //ComputeBi - const int idxb_max = snaKK.idxb_max; - Snap3DRangePolicy - policy_compute_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_compute_bi,1}); - Kokkos::parallel_for("ComputeBi",policy_compute_bi,*this); - - //Transform data layout of blist out of AoSoA - //We need this because `blist` gets used in ComputeForce which doesn't - //take advantage of AoSoA, which at best would only be beneficial on the margins - Snap3DRangePolicy - policy_transform_bi({0,0,0},{vector_length,idxb_max,chunk_size_div},{vector_length,tile_size_transform_bi,1}); - Kokkos::parallel_for("TransformBi",policy_transform_bi,*this); + if (quadraticflag) { + // Compute the quadratic correction + auto policy_compute_beta_quadratic = snap_get_policy(chunk_size_div, snaKK.idxb_max); + Kokkos::parallel_for("ComputeBetaQuadratic", policy_compute_beta_quadratic, *this); } //Note zeroing `ylist` is fused into `TransformUi`. - { - //Compute beta = dE_i/dB_i for all i in list - typename Kokkos::RangePolicy policy_beta(0,chunk_size); - Kokkos::parallel_for("ComputeBeta",policy_beta,*this); - const int idxz_max = snaKK.idxz_max; - if (quadraticflag || eflag) { - Snap3DRangePolicy - policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1}); - Kokkos::parallel_for("ComputeYiWithZlist",policy_compute_yi,*this); + if (quadraticflag || eflag) { + if (nelements > 1) { + auto policy_compute_yi = snap_get_policy>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeYiWithZlistChemsnap", policy_compute_yi, *this); } else { - Snap3DRangePolicy - policy_compute_yi({0,0,0},{vector_length,idxz_max,chunk_size_div},{vector_length,tile_size_compute_yi,1}); - Kokkos::parallel_for("ComputeYi",policy_compute_yi,*this); + auto policy_compute_yi = snap_get_policy>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeYiWithZlist", policy_compute_yi, *this); + } + } else { + if (nelements > 1) { + auto policy_compute_yi = snap_get_policy, min_blocks_compute_yi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeYiChemsnap", policy_compute_yi, *this); + } else { + auto policy_compute_yi = snap_get_policy, min_blocks_compute_yi>(chunk_size_div, snaKK.idxz_max); + Kokkos::parallel_for("ComputeYi", policy_compute_yi, *this); } } + } + if constexpr (host_flag) { + //ComputeDuidrj and Deidrj + auto policy_duidrj_cpu = snap_get_policy(chunk_size_div, max_neighs); + Kokkos::parallel_for("ComputeDuidrjCPU", policy_duidrj_cpu, *this); + + auto policy_deidrj_cpu = snap_get_policy(chunk_size_div, max_neighs); + Kokkos::parallel_for("ComputeDeidrjCPU",policy_deidrj_cpu,*this); + } else { // GPU // Fused ComputeDuidrj, ComputeDeidrj + // team_size_compute_fused_deidrj is defined in `pair_snap_kokkos.h` + + // scratch size: 32 atoms * (twojmax+1) cached values * 2 for u, du, no double buffer + const int tile_size = vector_length * (twojmax + 1); + const int scratch_size = scratch_size_helper(2 * team_size_compute_fused_deidrj * tile_size); + + if (chunk_size < parallel_thresh) { - // team_size_compute_fused_deidrj is defined in `pair_snap_kokkos.h` + // Version with parallelism over j_bend - // scratch size: 32 atoms * (twojmax+1) cached values * 2 for u, du, no double buffer - const int tile_size = vector_length * (twojmax + 1); - const int scratch_size = scratch_size_helper(2 * team_size_compute_fused_deidrj * tile_size); + // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations) + const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); + const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; - if (chunk_size < parallel_thresh) - { - // Version with parallelism over j_bend + // x direction + SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjSmall<0>",policy_fused_deidrj_x,*this); - // total number of teams needed: (natoms / 32) * (max_neighs) * ("bend" locations) - const int n_teams = chunk_size_div * max_neighs * (twojmax + 1); - const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; + // y direction + SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjSmall<1>",policy_fused_deidrj_y,*this); - // x direction - SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeFusedDeidrjSmall<0>",policy_fused_deidrj_x,*this); + // z direction + SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjSmall<2>",policy_fused_deidrj_z,*this); + } else { + // Version w/out parallelism over j_bend - // y direction - SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeFusedDeidrjSmall<1>",policy_fused_deidrj_y,*this); + // total number of teams needed: (natoms / 32) * (max_neighs) + const int n_teams = chunk_size_div * max_neighs; + const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; - // z direction - SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeFusedDeidrjSmall<2>",policy_fused_deidrj_z,*this); - } else { - // Version w/out parallelism over j_bend + // x direction + SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjLarge<0>",policy_fused_deidrj_x,*this); - // total number of teams needed: (natoms / 32) * (max_neighs) - const int n_teams = chunk_size_div * max_neighs; - const int n_teams_div = (n_teams + team_size_compute_fused_deidrj - 1) / team_size_compute_fused_deidrj; + // y direction + SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjLarge<1>",policy_fused_deidrj_y,*this); - // x direction - SnapAoSoATeamPolicy > policy_fused_deidrj_x(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_x = policy_fused_deidrj_x.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeFusedDeidrjLarge<0>",policy_fused_deidrj_x,*this); + // z direction + SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); + policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeFusedDeidrjLarge<2>",policy_fused_deidrj_z,*this); - // y direction - SnapAoSoATeamPolicy > policy_fused_deidrj_y(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_y = policy_fused_deidrj_y.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeFusedDeidrjLarge<1>",policy_fused_deidrj_y,*this); - - // z direction - SnapAoSoATeamPolicy > policy_fused_deidrj_z(n_teams_div,team_size_compute_fused_deidrj,vector_length); - policy_fused_deidrj_z = policy_fused_deidrj_z.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); - Kokkos::parallel_for("ComputeFusedDeidrjLarge<2>",policy_fused_deidrj_z,*this); - - } } - -#endif // LMP_KOKKOS_GPU - } //ComputeForce @@ -607,8 +536,8 @@ void PairSNAPKokkos::coeff(int narg, char Kokkos::deep_copy(d_dinnerelem,h_dinnerelem); Kokkos::deep_copy(d_map,h_map); - snaKK = SNAKokkos(rfac0,twojmax, - rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); + snaKK = SNAKokkos(*this); //rfac0,twojmax, + //rmin0,switchflag,bzeroflag,chemflag,bnormflag,wselfallflag,nelements,switchinnerflag); snaKK.grow_rij(0,0); snaKK.init(); } @@ -618,53 +547,10 @@ void PairSNAPKokkos::coeff(int narg, char of AoSoA data layouts and scratch memory for recursive polynomials ------------------------------------------------------------------------- */ -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBeta,const int& ii) const { - - if (ii >= chunk_size) return; - - const int iatom_mod = ii % vector_length; - const int iatom_div = ii / vector_length; - - const int i = d_ilist[ii + chunk_offset]; - const int itype = type[i]; - const int ielem = d_map[itype]; - SNAKokkos my_sna = snaKK; - - auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - d_beta_pack(iatom_mod,icoeff,iatom_div) = d_coeffi[icoeff+1]; - } - - if (quadraticflag) { - const auto idxb_max = my_sna.idxb_max; - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - const auto idxb = icoeff % idxb_max; - const auto idx_chem = icoeff / idxb_max; - real_type bveci = my_sna.blist(ii, idx_chem, idxb); - d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bveci; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - const auto jdxb = jcoeff % idxb_max; - const auto jdx_chem = jcoeff / idxb_max; - real_type bvecj = my_sna.blist(ii, jdx_chem, jdxb); - d_beta_pack(iatom_mod,icoeff,iatom_div) += d_coeffi[k]*bvecj; - d_beta_pack(iatom_mod,jcoeff,iatom_div) += d_coeffi[k]*bveci; - k++; - } - } - } -} - template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - // extract atom number int ii = team.team_rank() + team.league_rank() * team.team_size(); if (ii >= chunk_size) return; @@ -731,320 +617,34 @@ void PairSNAPKokkos::operator() (TagPairSN const F_FLOAT dy = x(j,1) - ytmp; const F_FLOAT dz = x(j,2) - ztmp; const int jelem = d_map[jtype]; - my_sna.rij(ii,offset,0) = static_cast(dx); - my_sna.rij(ii,offset,1) = static_cast(dy); - my_sna.rij(ii,offset,2) = static_cast(dz); - my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); - my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[jelem])*rcutfac); - my_sna.inside(ii,offset) = j; + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((radi + d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; if (switchinnerflag) { - my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); } if (chemflag) - my_sna.element(ii,offset) = jelem; + snaKK.element(ii,offset) = jelem; else - my_sna.element(ii,offset) = 0; + snaKK.element(ii,offset) = 0; } offset++; } }); } -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int ii = iatom_mod + iatom_div * vector_length; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jnbor >= ninside) return; - - my_sna.compute_cayley_klein(iatom_mod,jnbor,iatom_div); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int iatom_mod, const int j, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int ii = iatom_mod + iatom_div * vector_length; - if (ii >= chunk_size) return; - - int itype = type(ii); - int ielem = d_map[itype]; - - my_sna.pre_ui(iatom_mod, j, ielem, iatom_div); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug - const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); - const int jbend = jj_jbend / max_neighs; - int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - my_sna.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); - }); - -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug - int jj = flattened_idx - iatom_div * max_neighs; - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - my_sna.compute_ui_large(team,iatom_mod, jj, iatom_div); - }); - -} - - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformUi,const int iatom_mod, const int idxu, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (idxu > my_sna.idxu_max) return; - - int elem_count = chemflag ? nelements : 1; - - for (int ielem = 0; ielem < elem_count; ielem++) { - - const FullHalfMapper mapper = my_sna.idxu_full_half[idxu]; - - auto utot_re = my_sna.ulisttot_re_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); - auto utot_im = my_sna.ulisttot_im_pack(iatom_mod, mapper.idxu_half, ielem, iatom_div); - - if (mapper.flip_sign == 1) { - utot_im = -utot_im; - } else if (mapper.flip_sign == -1) { - utot_re = -utot_re; - } - - my_sna.ulisttot_pack(iatom_mod, idxu, ielem, iatom_div) = { utot_re, utot_im }; - - if (mapper.flip_sign == 0) { - my_sna.ylist_pack_re(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; - my_sna.ylist_pack_im(iatom_mod, mapper.idxu_half, ielem, iatom_div) = 0.; - } - } -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYi,const int iatom_mod, const int jjz, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (jjz >= my_sna.idxz_max) return; - - my_sna.compute_yi(iatom_mod,jjz,iatom_div,d_beta_pack); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist,const int iatom_mod, const int jjz, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (jjz >= my_sna.idxz_max) return; - - my_sna.compute_yi_with_zlist(iatom_mod,jjz,iatom_div,d_beta_pack); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZi,const int iatom_mod, const int jjz, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (jjz >= my_sna.idxz_max) return; - - my_sna.compute_zi(iatom_mod,jjz,iatom_div); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBi,const int iatom_mod, const int jjb, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (jjb >= my_sna.idxb_max) return; - - my_sna.compute_bi(iatom_mod,jjb,iatom_div); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformBi,const int iatom_mod, const int idxb, const int iatom_div) const { - SNAKokkos my_sna = snaKK; - - const int iatom = iatom_mod + iatom_div * vector_length; - if (iatom >= chunk_size) return; - - if (idxb >= my_sna.idxb_max) return; - - const int ntriples = my_sna.ntriples; - - for (int itriple = 0; itriple < ntriples; itriple++) { - - const real_type blocal = my_sna.blist_pack(iatom_mod, idxb, itriple, iatom_div); - - my_sna.blist(iatom, itriple, idxb) = blocal; - } - -} - -template -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjSmall,const typename Kokkos::TeamPolicy >::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug - const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); - const int jbend = jj_jbend / max_neighs; - int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - my_sna.template compute_fused_deidrj_small(team, iatom_mod, jbend, jj, iatom_div); - - }); - -} - -template -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjLarge,const typename Kokkos::TeamPolicy >::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // extract flattened atom_div / neighbor number / bend location - int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; - - // extract neighbor index, iatom_div - int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug - int jj = flattened_idx - max_neighs * iatom_div; - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), - [&] (const int iatom_mod) { - const int ii = iatom_mod + vector_length * iatom_div; - if (ii >= chunk_size) return; - - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - my_sna.template compute_fused_deidrj_large(team, iatom_mod, jj, iatom_div); - - }); -} - -/* ---------------------------------------------------------------------- - Begin routines that are unique to the CPU codepath. These do not take - advantage of AoSoA data layouts, but that could be a good point of - future optimization and unification with the above kernels. It's unlikely - that scratch memory optimizations will ever be useful for the CPU due to - different arithmetic intensity requirements for the CPU vs GPU. -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPBetaCPU,const int& ii) const { - - const int i = d_ilist[ii + chunk_offset]; - const int itype = type[i]; - const int ielem = d_map[itype]; - SNAKokkos my_sna = snaKK; - - auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); - - for (int icoeff = 0; icoeff < ncoeff; icoeff++) - d_beta(icoeff,ii) = d_coeffi[icoeff+1]; - - if (quadraticflag) { - const auto idxb_max = my_sna.idxb_max; - int k = ncoeff+1; - for (int icoeff = 0; icoeff < ncoeff; icoeff++) { - const auto idxb = icoeff % idxb_max; - const auto idx_chem = icoeff / idxb_max; - real_type bveci = my_sna.blist(ii,idx_chem,idxb); - d_beta(icoeff,ii) += d_coeffi[k]*bveci; - k++; - for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { - const auto jdxb = jcoeff % idxb_max; - const auto jdx_chem = jcoeff / idxb_max; - real_type bvecj = my_sna.blist(ii,jdx_chem,jdxb); - d_beta(icoeff,ii) += d_coeffi[k]*bvecj; - d_beta(jcoeff,ii) += d_coeffi[k]*bveci; - k++; - } - } - } -} - template KOKKOS_INLINE_FUNCTION void PairSNAPKokkos::operator() (TagPairSNAPComputeNeighCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); + if (ii >= chunk_size) return; + const int i = d_ilist[ii + chunk_offset]; - SNAKokkos my_sna = snaKK; const double xtmp = x(i,0); const double ytmp = x(i,1); const double ztmp = x(i,2); @@ -1094,170 +694,510 @@ void PairSNAPKokkos::operator() (TagPairSN if (rsq < rnd_cutsq(itype,jtype)) { if (final) { - my_sna.rij(ii,offset,0) = static_cast(dx); - my_sna.rij(ii,offset,1) = static_cast(dy); - my_sna.rij(ii,offset,2) = static_cast(dz); - my_sna.wj(ii,offset) = static_cast(d_wjelem[jelem]); - my_sna.rcutij(ii,offset) = static_cast((radi + d_radelem[jelem])*rcutfac); - my_sna.inside(ii,offset) = j; + snaKK.rij(ii,offset,0) = static_cast(dx); + snaKK.rij(ii,offset,1) = static_cast(dy); + snaKK.rij(ii,offset,2) = static_cast(dz); + snaKK.wj(ii,offset) = static_cast(d_wjelem[jelem]); + snaKK.rcutij(ii,offset) = static_cast((radi + d_radelem[jelem])*rcutfac); + snaKK.inside(ii,offset) = j; if (switchinnerflag) { - my_sna.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); - my_sna.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); + snaKK.sinnerij(ii,offset) = 0.5*(d_sinnerelem[ielem] + d_sinnerelem[jelem]); + snaKK.dinnerij(ii,offset) = 0.5*(d_dinnerelem[ielem] + d_dinnerelem[jelem]); } if (chemflag) - my_sna.element(ii,offset) = jelem; + snaKK.element(ii,offset) = jelem; else - my_sna.element(ii,offset) = 0; + snaKK.element(ii,offset) = 0; } offset++; } }); } +/* ---------------------------------------------------------------------- + Pre-compute the Cayley-Klein parameters for reuse in later routines. + GPU only. +------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPPreUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // Extract the atom number - const int ii = team.team_rank() + team.team_size() * team.league_rank(); - if (ii >= chunk_size) return; - int itype = type(ii); - int ielem = d_map[itype]; - - my_sna.pre_ui_cpu(team,ii,ielem); -} - - - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; - - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - my_sna.compute_ui_cpu(team,ii,jj); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPTransformUiCPU, const int j, const int iatom) const { - SNAKokkos my_sna = snaKK; +void PairSNAPKokkos::operator() (TagPairSNAPComputeCayleyKlein,const int iatom_mod, const int jnbor, const int iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; if (iatom >= chunk_size) return; - if (j > twojmax) return; + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; - int elem_count = chemflag ? nelements : 1; - - // De-symmetrize ulisttot - for (int ielem = 0; ielem < elem_count; ielem++) { - - const int jju_half = my_sna.idxu_half_block(j); - const int jju = my_sna.idxu_block(j); - - for (int mb = 0; 2*mb <= j; mb++) { - for (int ma = 0; ma <= j; ma++) { - // Extract top half - - const int idxu_shift = mb * (j + 1) + ma; - const int idxu_half = jju_half + idxu_shift; - const int idxu = jju + idxu_shift; - - // Load ulist - auto utot = my_sna.ulisttot(idxu_half, ielem, iatom); - - // Store - my_sna.ulisttot_full(idxu, ielem, iatom) = utot; - - // Zero Yi - my_sna.ylist(idxu_half, ielem, iatom) = {0., 0.}; - - // Symmetric term - const int sign_factor = (((ma+mb)%2==0)?1:-1); - const int idxu_flip = jju + (j + 1 - mb) * (j + 1) - (ma + 1); - - if (sign_factor == 1) { - utot.im = -utot.im; - } else { - utot.re = -utot.re; - } - - my_sna.ulisttot_full(idxu_flip, ielem, iatom) = utot; - } - } - } -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeYiCPU,const int& ii) const { - SNAKokkos my_sna = snaKK; - my_sna.compute_yi_cpu(ii,d_beta); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeZiCPU,const int& ii) const { - SNAKokkos my_sna = snaKK; - my_sna.compute_zi_cpu(ii); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeBiCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - int ii = team.league_rank(); - SNAKokkos my_sna = snaKK; - my_sna.compute_bi_cpu(team,ii); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; - - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - my_sna.compute_duidrj_cpu(team,ii,jj); -} - -template -KOKKOS_INLINE_FUNCTION -void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU,const typename Kokkos::TeamPolicy::member_type& team) const { - SNAKokkos my_sna = snaKK; - - // Extract the atom number - int ii = team.team_rank() + team.team_size() * (team.league_rank() % ((chunk_size+team.team_size()-1)/team.team_size())); - if (ii >= chunk_size) return; - - // Extract the neighbor number - const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); - const int ninside = d_ninside(ii); - if (jj >= ninside) return; - - my_sna.compute_deidrj_cpu(team,ii,jj); + snaKK.compute_cayley_klein(iatom, jnbor); } /* ---------------------------------------------------------------------- - Also used for both CPU and GPU codepaths. Could maybe benefit from a - separate GPU/CPU codepath, but this kernel takes so little time it's - likely not worth it. + Initialize the "ulisttot" structure with non-zero on-diagonal terms + and zero terms elsewhere; both CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int& iatom_mod, const int& j, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + + int itype = type(iatom); + int ielem = d_map[itype]; + + snaKK.pre_ui(iatom, j, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int& iatom, const int& j) const { + if (iatom >= chunk_size) return; + + int itype = type(iatom); + int ielem = d_map[itype]; + + snaKK.pre_ui(iatom, j, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPPreUi, const int& iatom) const { + if (iatom >= chunk_size) return; + + const int itype = type(iatom); + const int ielem = d_map[itype]; + + for (int j = 0; j <= twojmax; j++) + snaKK.pre_ui(iatom, j, ielem); +} + +/* ---------------------------------------------------------------------- + Accumulate the spectral contributions from atom, neighbor pairs into + ulisttot_re and _im. These routines are GPU only and use scratch memory + staging. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiSmall,const typename Kokkos::TeamPolicy::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.compute_ui_small(team, iatom_mod, jbend, jj, iatom_div); + }); + +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiLarge,const typename Kokkos::TeamPolicy::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_ui; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - iatom_div * max_neighs; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.compute_ui_large(team,iatom_mod, jj, iatom_div); + }); +} + +/* ---------------------------------------------------------------------- + Accumulate the spectral contributions from atom, neighbor pairs into + ulisttot_re and _im. This routine is CPU only and does not use staging. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU, const int& iatom, const int& jnbor) const { + if (iatom >= chunk_size) return; + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + snaKK.template compute_ui_cpu(iatom, jnbor); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeUiCPU, const int& iatom) const { + if (iatom >= chunk_size) return; + const int ninside = d_ninside(iatom); + for (int jnbor = 0; jnbor < ninside; jnbor++) + snaKK.template compute_ui_cpu(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot + structure. Zero-initialize ylist. CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int& iatom_mod, const int& idxu, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (idxu >= snaKK.idxu_max) return; + snaKK.transform_ui(iatom, idxu); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int& iatom, const int& idxu) const { + if (iatom >= chunk_size) return; + snaKK.transform_ui(iatom, idxu); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPTransformUi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int idxu = 0; idxu < snaKK.idxu_max; idxu++) + snaKK.transform_ui(iatom, idxu); +} + +/* ---------------------------------------------------------------------- + Compute all elements of the Z tensor and store them into the `zlist` + View. This is only used for energy timesteps or quadratic SNAP. + CPU and GPU. +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom_mod, const int& jjz, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjz >= snaKK.idxz_max) return; + snaKK.template compute_zi(iatom, jjz); +} + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom, const int& jjz) const { + if (iatom >= chunk_size) return; + snaKK.template compute_zi(iatom, jjz); +} + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeZi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) + snaKK.template compute_zi(iatom, jjz); +} + +/* ---------------------------------------------------------------------- + Compute the energy triple products and store in the "blist" View. + CPU and GPU. +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom_mod, const int& jjb, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjb >= snaKK.idxb_max) return; + snaKK.template compute_bi(iatom, jjb); +} + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom, const int& jjb) const { + if (iatom >= chunk_size) return; + snaKK.template compute_bi(iatom, jjb); +} + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjb = 0; jjb < snaKK.idxb_max; jjb++) + snaKK.template compute_bi(iatom, jjb); +} + +/* ---------------------------------------------------------------------- + Assemble the "beta" coefficients that enter the computation of the + adjoint matrices Y. This is just for a linear potential. A quadratic + contribution is added in a subsequent kernel. CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaLinear, const int& iatom_mod, const int& idxb, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (idxb >= snaKK.idxb_max) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + snaKK.compute_beta_linear(iatom, idxb, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaLinear, const int& iatom, const int& idxb) const { + if (iatom >= chunk_size) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + snaKK.compute_beta_linear(iatom, idxb, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaLinear, const int& iatom) const { + if (iatom >= chunk_size) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + for (int idxb = 0; idxb < snaKK.idxb_max; idxb++) + snaKK.compute_beta_linear(iatom, idxb, ielem); +} + +/* ---------------------------------------------------------------------- + Accumulate the qudratic terms which includes accumulating + energy triple products into an "effective" beta that encodes the + quadratic terms with otherwise linear compute work. + CPU and GPU. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom_mod, const int& idxb, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (idxb >= snaKK.idxb_max) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + snaKK.template compute_beta_quadratic(iatom, idxb, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom, const int& idxb) const { + if (iatom >= chunk_size) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + snaKK.template compute_beta_quadratic(iatom, idxb, ielem); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeBetaQuadratic, const int& iatom) const { + if (iatom >= chunk_size) return; + + const int i = d_ilist[iatom + chunk_offset]; + const int itype = type[i]; + const int ielem = d_map[itype]; + + for (int idxb = 0; idxb < snaKK.idxb_max; idxb++) + snaKK.template compute_beta_quadratic(iatom, idxb, ielem); +} + +/* ---------------------------------------------------------------------- + Compute all elements of the Z tensor and accumultate them into the + adjoint matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU. +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& iatom_mod, const int& jjz, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjz >= snaKK.idxz_max) return; + snaKK.template compute_yi(iatom, jjz); +} + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& iatom, const int& jjz) const { + if (iatom >= chunk_size) return; + snaKK.template compute_yi(iatom, jjz); +} + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYi, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) + snaKK.template compute_yi(iatom, jjz); +} + +/* ---------------------------------------------------------------------- + Accumulate the pre-computed elements of the Z tensor into the adjoint + matrices Y (ylist_re, _im) on non-energy timesteps. CPU and GPU. +------------------------------------------------------------------------- */ + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom_mod, const int& jjz, const int& iatom_div) const { + const int iatom = iatom_mod + iatom_div * vector_length; + if (iatom >= chunk_size) return; + if (jjz >= snaKK.idxz_max) return; + snaKK.template compute_yi_with_zlist(iatom, jjz); +} + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom, const int& jjz) const { + if (iatom >= chunk_size) return; + snaKK.template compute_yi_with_zlist(iatom, jjz); +} + +template +template KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeYiWithZlist, const int& iatom) const { + if (iatom >= chunk_size) return; + for (int jjz = 0; jjz < snaKK.idxz_max; jjz++) + snaKK.template compute_yi_with_zlist(iatom, jjz); +} + +/* ---------------------------------------------------------------------- + Assemble the force contributions for each atom, neighbor pair by + contracting the adjoint matrices Y with derivatives of the Wigner + matrices U. These routines are GPU only and use scratch memory + staging. +------------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjSmall,const typename Kokkos::TeamPolicy >::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / (max_neighs * (twojmax + 1)); // removed "const" to work around GCC 7 bug + const int jj_jbend = flattened_idx - iatom_div * (max_neighs * (twojmax + 1)); + const int jbend = jj_jbend / max_neighs; + int jj = jj_jbend - jbend * max_neighs; // removed "const" to work around GCC 7 bug + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.template compute_fused_deidrj_small(team, iatom_mod, jbend, jj, iatom_div); + + }); + +} + +template +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeFusedDeidrjLarge,const typename Kokkos::TeamPolicy >::member_type& team) const { + + // extract flattened atom_div / neighbor number / bend location + int flattened_idx = team.team_rank() + team.league_rank() * team_size_compute_fused_deidrj; + + // extract neighbor index, iatom_div + int iatom_div = flattened_idx / max_neighs; // removed "const" to work around GCC 7 bug + int jj = flattened_idx - max_neighs * iatom_div; + + Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, vector_length), + [&] (const int iatom_mod) { + const int ii = iatom_mod + vector_length * iatom_div; + if (ii >= chunk_size) return; + + const int ninside = d_ninside(ii); + if (jj >= ninside) return; + + snaKK.template compute_fused_deidrj_large(team, iatom_mod, jj, iatom_div); + + }); +} + +/* ---------------------------------------------------------------------- + Assemble the derivatives of the Winger matrices U into the View + "dulist". CPU only. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU, const int& iatom, const int& jnbor) const { + if (iatom >= chunk_size) return; + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + snaKK.compute_duidrj_cpu(iatom, jnbor); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDuidrjCPU, const int& iatom) const { + if (iatom >= chunk_size) return; + const int ninside = d_ninside(iatom); + for (int jnbor = 0; jnbor < ninside; jnbor++) + snaKK.compute_duidrj_cpu(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + Assemble the force contributions for each atom, neighbor pair by + contracting the adjoint matrices Y with the pre-computed derivatives + of the Wigner matrices U. CPU only. +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU, const int& iatom, const int& jnbor) const { + if (iatom >= chunk_size) return; + const int ninside = d_ninside(iatom); + if (jnbor >= ninside) return; + snaKK.compute_deidrj_cpu(iatom, jnbor); +} + +template +KOKKOS_INLINE_FUNCTION +void PairSNAPKokkos::operator() (TagPairSNAPComputeDeidrjCPU, const int& iatom) const { + if (iatom >= chunk_size) return; + const int ninside = d_ninside(iatom); + for (int jnbor = 0; jnbor < ninside; jnbor++) + snaKK.compute_deidrj_cpu(iatom, jnbor); +} + +/* ---------------------------------------------------------------------- + This routine formally accumulates the "chunked" force contributions + into the broader LAMMPS "f" force View. As appropriate it + also accumulates the total energy and the virial. CPU and GPU. ------------------------------------------------------------------------- */ template @@ -1271,17 +1211,15 @@ void PairSNAPKokkos::operator() (TagPairSN const int i = d_ilist[ii + chunk_offset]; - SNAKokkos my_sna = snaKK; - const int ninside = d_ninside(ii); for (int jj = 0; jj < ninside; jj++) { - int j = my_sna.inside(ii,jj); + int j = snaKK.inside(ii,jj); F_FLOAT fij[3]; - fij[0] = my_sna.dedr(ii,jj,0); - fij[1] = my_sna.dedr(ii,jj,1); - fij[2] = my_sna.dedr(ii,jj,2); + fij[0] = snaKK.dedr(ii,jj,0); + fij[1] = snaKK.dedr(ii,jj,1); + fij[2] = snaKK.dedr(ii,jj,2); a_f(i,0) += fij[0]; a_f(i,1) += fij[1]; @@ -1294,8 +1232,8 @@ void PairSNAPKokkos::operator() (TagPairSN if (vflag_either) { v_tally_xyz(ev,i,j, fij[0],fij[1],fij[2], - -my_sna.rij(ii,jj,0),-my_sna.rij(ii,jj,1), - -my_sna.rij(ii,jj,2)); + -snaKK.rij(ii,jj,0),-snaKK.rij(ii,jj,1), + -snaKK.rij(ii,jj,2)); } } @@ -1322,7 +1260,7 @@ void PairSNAPKokkos::operator() (TagPairSN for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - evdwl += d_coeffi[icoeff+1]*my_sna.blist(ii,idx_chem,idxb); + evdwl += d_coeffi[icoeff+1]*snaKK.blist(ii,idx_chem,idxb); } // quadratic contributions @@ -1331,12 +1269,12 @@ void PairSNAPKokkos::operator() (TagPairSN for (int icoeff = 0; icoeff < ncoeff; icoeff++) { const auto idxb = icoeff % idxb_max; const auto idx_chem = icoeff / idxb_max; - real_type bveci = my_sna.blist(ii,idx_chem,idxb); + real_type bveci = snaKK.blist(ii,idx_chem,idxb); evdwl += 0.5*d_coeffi[k++]*bveci*bveci; for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { auto jdxb = jcoeff % idxb_max; auto jdx_chem = jcoeff / idxb_max; - auto bvecj = my_sna.blist(ii,jdx_chem,jdxb); + auto bvecj = snaKK.blist(ii,jdx_chem,jdxb); evdwl += d_coeffi[k++]*bveci*bvecj; } } @@ -1411,8 +1349,6 @@ double PairSNAPKokkos::memory_usage() { double bytes = Pair::memory_usage(); bytes += MemKK::memory_usage(d_beta); - if (!host_flag) - bytes += MemKK::memory_usage(d_beta_pack); bytes += MemKK::memory_usage(d_ninside); bytes += MemKK::memory_usage(d_map); bytes += MemKK::memory_usage(d_radelem); diff --git a/src/KOKKOS/sna_kokkos.h b/src/KOKKOS/sna_kokkos.h index 78a3dfa669..a438ccd25e 100644 --- a/src/KOKKOS/sna_kokkos.h +++ b/src/KOKKOS/sna_kokkos.h @@ -134,6 +134,8 @@ class SNAKokkos { static constexpr int vector_length = vector_length_; using KKDeviceType = typename KKDevice::value; + static constexpr LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; + static constexpr int host_flag = (execution_space == LAMMPS_NS::Host); typedef Kokkos::View t_sna_1i; typedef Kokkos::View t_sna_1d; @@ -141,6 +143,7 @@ class SNAKokkos { typedef Kokkos::View t_sna_2i; typedef Kokkos::View t_sna_2d; typedef Kokkos::View t_sna_2d_ll; + typedef Kokkos::View t_sna_2d_lr; typedef Kokkos::View t_sna_3d; typedef Kokkos::View t_sna_3d_ll; typedef Kokkos::View t_sna_4d; @@ -156,7 +159,7 @@ class SNAKokkos { typedef Kokkos::View t_sna_3c; typedef Kokkos::View t_sna_3c_ll; typedef Kokkos::View t_sna_4c; - typedef Kokkos::View t_sna_4c3_ll; + typedef Kokkos::View t_sna_4c3; typedef Kokkos::View t_sna_4c_ll; typedef Kokkos::View t_sna_3c3; typedef Kokkos::View t_sna_5c; @@ -168,7 +171,8 @@ class SNAKokkos { SNAKokkos(const SNAKokkos& sna, const typename Kokkos::TeamPolicy::member_type& team); inline - SNAKokkos(real_type, int, real_type, int, int, int, int, int, int, int); + //SNAKokkos(real_type, int, real_type, int, int, int, int, int, int, int); + SNAKokkos(const PairSNAPKokkos&); KOKKOS_INLINE_FUNCTION ~SNAKokkos(); @@ -182,88 +186,87 @@ class SNAKokkos { double memory_usage(); int ncoeff; - int host_flag; // functions for bispectrum coefficients, GPU only KOKKOS_INLINE_FUNCTION - void compute_cayley_klein(const int&, const int&, const int&); + void compute_cayley_klein(const int&, const int&) const; KOKKOS_INLINE_FUNCTION - void pre_ui(const int&, const int&, const int&, const int&); // ForceSNAP + void pre_ui(const int&, const int&, const int&) const; // ForceSNAP // version of the code with parallelism over j_bend KOKKOS_INLINE_FUNCTION - void compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int); // ForceSNAP + void compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int) const; // ForceSNAP // version of the code without parallelism over j_bend KOKKOS_INLINE_FUNCTION - void compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int); // ForceSNAP + void compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int) const; // ForceSNAP + // desymmetrize ulisttot KOKKOS_INLINE_FUNCTION - void compute_zi(const int&, const int&, const int&); // ForceSNAP + void transform_ui(const int&, const int&) const; + + template KOKKOS_INLINE_FUNCTION + void compute_zi(const int&, const int&) const; // ForceSNAP + template KOKKOS_INLINE_FUNCTION + void compute_yi(const int&, const int&) const; // ForceSNAP + template KOKKOS_INLINE_FUNCTION + void compute_yi_with_zlist(const int&, const int&) const; // ForceSNAP + template KOKKOS_INLINE_FUNCTION + void compute_bi(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_yi(int,int,int, - const Kokkos::View &beta_pack); // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_yi_with_zlist(int,int,int, - const Kokkos::View &beta_pack); // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_bi(const int&, const int&, const int&); // ForceSNAP + void compute_beta_linear(const int&, const int&, const int&) const; + template KOKKOS_INLINE_FUNCTION + void compute_beta_quadratic(const int&, const int&, const int&) const; // functions for derivatives, GPU only // version of the code with parallelism over j_bend template KOKKOS_INLINE_FUNCTION - void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int); //ForceSNAP + void compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int, const int) const; //ForceSNAP // version of the code without parallelism over j_bend template KOKKOS_INLINE_FUNCTION - void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int); //ForceSNAP + void compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int, const int, const int) const; //ForceSNAP // core "evaluation" functions that get plugged into "compute" functions // plugged into compute_ui_small, compute_ui_large KOKKOS_FORCEINLINE_FUNCTION void evaluate_ui_jbend(const WignerWrapper&, const complex&, const complex&, const real_type&, const int&, - const int&, const int&, const int&); + const int&, const int&) const; // plugged into compute_zi, compute_yi KOKKOS_FORCEINLINE_FUNCTION complex evaluate_zi(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, - const int&, const int&, const int&, const int&, const real_type*); - // plugged into compute_yi, compute_yi_with_zlist + const int&, const int&, const int&, const real_type*) const; + // plugged into compute_bi KOKKOS_FORCEINLINE_FUNCTION - real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&, const int&, - const Kokkos::View &); + real_type evaluate_bi(const int&, const int&, const int&, const int&, + const int&, const int&, const int&) const; + // plugged into compute_yi, compute_yi_with_zlist + template KOKKOS_FORCEINLINE_FUNCTION + real_type evaluate_beta_scaled(const int&, const int&, const int&, const int&, const int&, const int&, const int&) const; // plugged into compute_fused_deidrj_small, compute_fused_deidrj_large KOKKOS_FORCEINLINE_FUNCTION real_type evaluate_duidrj_jbend(const WignerWrapper&, const complex&, const complex&, const real_type&, const WignerWrapper&, const complex&, const complex&, const real_type&, - const int&, const int&, const int&, const int&); + const int&, const int&, const int&) const; // functions for bispectrum coefficients, CPU only - KOKKOS_INLINE_FUNCTION - void pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team,const int&,const int&); // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_zi_cpu(const int&); // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_yi_cpu(int, - const Kokkos::View &beta); // ForceSNAP - KOKKOS_INLINE_FUNCTION - void compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int); // ForceSNAP + template KOKKOS_INLINE_FUNCTION + void compute_ui_cpu(const int&, const int&) const; // ForceSNAP // functions for derivatives, CPU only KOKKOS_INLINE_FUNCTION - void compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); //ForceSNAP + void compute_duidrj_cpu(const int&, const int&) const; //ForceSNAP KOKKOS_INLINE_FUNCTION - void compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int); // ForceSNAP + void compute_deidrj_cpu(const int&, const int&) const; // ForceSNAP KOKKOS_INLINE_FUNCTION - real_type compute_sfac(real_type, real_type, real_type, real_type); // add_uarraytot, compute_duarray + real_type compute_sfac(real_type, real_type, real_type, real_type) const; // add_uarraytot, compute_duarray KOKKOS_INLINE_FUNCTION - real_type compute_dsfac(real_type, real_type, real_type, real_type); // compute_duarray + real_type compute_dsfac(real_type, real_type, real_type, real_type) const; // compute_duarray KOKKOS_INLINE_FUNCTION - void compute_s_dsfac(const real_type, const real_type, const real_type, const real_type, real_type&, real_type&); // compute_cayley_klein + void compute_s_dsfac(const real_type, const real_type, const real_type, const real_type, real_type&, real_type&) const; // compute_cayley_klein #ifdef TIMING_INFO double* timers; @@ -283,37 +286,41 @@ class SNAKokkos { t_sna_2d dinnerij; t_sna_2i element; t_sna_3d dedr; - int natom, nmax; + int natom, natom_pad, nmax; void grow_rij(int, int); int twojmax, diagonalstyle; + // Input beta coefficients; aliases the object in PairSnapKokkos + t_sna_2d_lr d_coeffelem; + + // Beta for all atoms in list; aliases the object in PairSnapKokkos + // for qSNAP the quadratic terms get accumulated into it + // in compute_bi + t_sna_2d d_beta; + + // Structures for both the CPU, GPU backend + t_sna_3d ulisttot_re; + t_sna_3d ulisttot_im; + t_sna_3c ulisttot; // un-folded ulisttot + + t_sna_3c zlist; t_sna_3d blist; - t_sna_3c_ll ulisttot; - t_sna_3c_ll ulisttot_full; // un-folded ulisttot, cpu only - t_sna_3c_ll zlist; - t_sna_3c_ll ulist; - t_sna_3c_ll ylist; + t_sna_3d ylist_re; + t_sna_3d ylist_im; - // derivatives of data - t_sna_4c3_ll dulist; + // Structures for the CPU backend only + t_sna_3c ulist_cpu; + t_sna_4c3 dulist_cpu; // Modified structures for GPU backend - t_sna_3c_ll a_pack; // Cayley-Klein `a` - t_sna_3c_ll b_pack; // `b` - t_sna_4c_ll da_pack; // `da` - t_sna_4c_ll db_pack; // `db` - t_sna_4d_ll sfac_pack; // sfac, dsfac_{x,y,z} - - t_sna_4d_ll ulisttot_re_pack; // split real, - t_sna_4d_ll ulisttot_im_pack; // imag, AoSoA, flattened - t_sna_4c_ll ulisttot_pack; // AoSoA layout - t_sna_4c_ll zlist_pack; // AoSoA layout - t_sna_4d_ll blist_pack; - t_sna_4d_ll ylist_pack_re; // split real, - t_sna_4d_ll ylist_pack_im; // imag AoSoA layout + t_sna_2c a_gpu; // Cayley-Klein `a` + t_sna_2c b_gpu; // `b` + t_sna_3c da_gpu; // `da` + t_sna_3c db_gpu; // `db` + t_sna_3d sfac_gpu; // sfac, dsfac_{x,y,z} int idxcg_max, idxu_max, idxu_half_max, idxu_cache_max, idxz_max, idxb_max; @@ -363,25 +370,11 @@ class SNAKokkos { inline void init_rootpqarray(); // init() - KOKKOS_INLINE_FUNCTION - void add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int, int, const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, int); // compute_ui - - KOKKOS_INLINE_FUNCTION - void compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, - const real_type&, const real_type&, const real_type&, - const real_type&, const real_type&); // compute_ui_cpu - - inline double deltacg(int, int, int); // init_clebsch_gordan inline int compute_ncoeff(); // SNAKokkos() - KOKKOS_INLINE_FUNCTION - void compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int, int, - const real_type&, const real_type&, const real_type&, // compute_duidrj_cpu - const real_type&, const real_type&, const real_type&, const real_type&, const real_type&, - const real_type&, const real_type&); // Sets the style for the switching function // 0 = none @@ -401,6 +394,9 @@ class SNAKokkos { real_type wself; int wselfall_flag; + // quadratic flag + int quadratic_flag; + int bzero_flag; // 1 if bzero subtracted from barray Kokkos::View bzero; // array of B values for isolated atoms }; @@ -409,4 +405,3 @@ class SNAKokkos { #include "sna_kokkos_impl.h" #endif - diff --git a/src/KOKKOS/sna_kokkos_impl.h b/src/KOKKOS/sna_kokkos_impl.h index d5690ea60a..9a97f229b5 100644 --- a/src/KOKKOS/sna_kokkos_impl.h +++ b/src/KOKKOS/sna_kokkos_impl.h @@ -30,35 +30,24 @@ static const double MY_PI2 = 1.57079632679489661923; // pi/2 template inline -SNAKokkos::SNAKokkos(real_type rfac0_in, - int twojmax_in, real_type rmin0_in, int switch_flag_in, int bzero_flag_in, - int chem_flag_in, int bnorm_flag_in, int wselfall_flag_in, int nelements_in, int switch_inner_flag_in) +SNAKokkos::SNAKokkos(const PairSNAPKokkos& psk) + : rfac0(psk.rfac0), rmin0(psk.rmin0), switch_flag(psk.switchflag), + bzero_flag(psk.bzeroflag), chem_flag(psk.chemflag), bnorm_flag(psk.bnormflag), + wselfall_flag(psk.wselfallflag), switch_inner_flag(psk.switchinnerflag), + quadratic_flag(psk.quadraticflag), twojmax(psk.twojmax), d_coeffelem(psk.d_coeffelem) { - LAMMPS_NS::ExecutionSpace execution_space = ExecutionSpaceFromDevice::space; - host_flag = (execution_space == LAMMPS_NS::Host); - wself = static_cast(1.0); - rfac0 = rfac0_in; - rmin0 = rmin0_in; - switch_flag = switch_flag_in; - switch_inner_flag = switch_inner_flag_in; - bzero_flag = bzero_flag_in; - - chem_flag = chem_flag_in; if (chem_flag) - nelements = nelements_in; + nelements = psk.nelements; else nelements = 1; - bnorm_flag = bnorm_flag_in; - wselfall_flag = wselfall_flag_in; - - twojmax = twojmax_in; ncoeff = compute_ncoeff(); nmax = 0; natom = 0; + natom_pad = 0; build_indexlist(); @@ -301,65 +290,47 @@ void SNAKokkos::grow_rij(int newnatom, int { if (newnatom <= natom && newnmax <= nmax) return; natom = newnatom; + // Create padded structures + const int natom_div = (natom + vector_length - 1) / vector_length; + natom_pad = natom_div * vector_length; nmax = newnmax; - MemKK::realloc_kokkos(rij,"sna:rij",natom,nmax,3); - MemKK::realloc_kokkos(wj,"sna:wj",natom,nmax); - MemKK::realloc_kokkos(rcutij,"sna:rcutij",natom,nmax); - MemKK::realloc_kokkos(sinnerij,"sna:sinnerij",natom,nmax); - MemKK::realloc_kokkos(dinnerij,"sna:dinnerij",natom,nmax); - MemKK::realloc_kokkos(inside,"sna:inside",natom,nmax); - MemKK::realloc_kokkos(element,"sna:element",natom,nmax); - MemKK::realloc_kokkos(dedr,"sna:dedr",natom,nmax,3); + MemKK::realloc_kokkos(rij,"sna:rij",natom_pad,nmax,3); + MemKK::realloc_kokkos(wj,"sna:wj",natom_pad,nmax); + MemKK::realloc_kokkos(rcutij,"sna:rcutij",natom_pad,nmax); + MemKK::realloc_kokkos(sinnerij,"sna:sinnerij",natom_pad,nmax); + MemKK::realloc_kokkos(dinnerij,"sna:dinnerij",natom_pad,nmax); + MemKK::realloc_kokkos(inside,"sna:inside",natom_pad,nmax); + MemKK::realloc_kokkos(element,"sna:element",natom_pad,nmax); + MemKK::realloc_kokkos(dedr,"sna:dedr",natom_pad,nmax,3); -#ifdef LMP_KOKKOS_GPU - if (!host_flag) { - const int natom_div = (natom + vector_length - 1) / vector_length; + MemKK::realloc_kokkos(ulisttot_re,"sna:ulisttot_re", natom_pad, nelements, idxu_half_max); + MemKK::realloc_kokkos(ulisttot_im,"sna:ulisttot_im", natom_pad, nelements, idxu_half_max); + MemKK::realloc_kokkos(ulisttot,"sna:ulisttot", natom_pad, nelements, idxu_max); - MemKK::realloc_kokkos(a_pack,"sna:a_pack",vector_length,nmax,natom_div); - MemKK::realloc_kokkos(b_pack,"sna:b_pack",vector_length,nmax,natom_div); - MemKK::realloc_kokkos(da_pack,"sna:da_pack",vector_length,nmax,natom_div,3); - MemKK::realloc_kokkos(db_pack,"sna:db_pack",vector_length,nmax,natom_div,3); - MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",vector_length,nmax,natom_div,4); - MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",1,1,1); // dummy allocation - MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot",1,1,1); - MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re_pack",vector_length,idxu_half_max,nelements,natom_div); - MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im_pack",vector_length,idxu_half_max,nelements,natom_div); - MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",vector_length,idxu_max,nelements,natom_div); - MemKK::realloc_kokkos(ulist,"sna:ulist",1,1,1); - MemKK::realloc_kokkos(zlist,"sna:zlist",1,1,1); - MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",vector_length,idxz_max,ndoubles,natom_div); - MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max); - MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",vector_length,idxb_max,ntriples,natom_div); - MemKK::realloc_kokkos(ylist,"sna:ylist",1,1,1); - MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",vector_length,idxu_half_max,nelements,natom_div); - MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",vector_length,idxu_half_max,nelements,natom_div); - MemKK::realloc_kokkos(dulist,"sna:dulist",1,1,1); + MemKK::realloc_kokkos(zlist,"sna:zlist", natom_pad, ndoubles, idxz_max); + MemKK::realloc_kokkos(blist,"sna:blist", natom_pad, ntriples, idxb_max); + + MemKK::realloc_kokkos(ylist_re,"sna:ylist_re", natom_pad, nelements, idxu_half_max); + MemKK::realloc_kokkos(ylist_im,"sna:ylist_im", natom_pad, nelements, idxu_half_max); + + if constexpr (!host_flag) { + MemKK::realloc_kokkos(a_gpu,"sna:a_gpu",natom_pad,nmax); + MemKK::realloc_kokkos(b_gpu,"sna:b_gpu",natom_pad,nmax); + MemKK::realloc_kokkos(da_gpu,"sna:da_gpu",natom_pad,nmax,3); + MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",natom_pad,nmax,3); + MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",natom_pad,nmax,4); + MemKK::realloc_kokkos(ulist_cpu,"sna:ulist_cpu",1,1,1); + MemKK::realloc_kokkos(dulist_cpu,"sna:dulist_cpu",1,1,1); } else { -#endif - MemKK::realloc_kokkos(a_pack,"sna:a_pack",1,1,1); - MemKK::realloc_kokkos(b_pack,"sna:b_pack",1,1,1); - MemKK::realloc_kokkos(da_pack,"sna:da_pack",1,1,1,1); - MemKK::realloc_kokkos(db_pack,"sna:db_pack",1,1,1,1); - MemKK::realloc_kokkos(sfac_pack,"sna:sfac_pack",1,1,1,1); - MemKK::realloc_kokkos(ulisttot,"sna:ulisttot",idxu_half_max,nelements,natom); - MemKK::realloc_kokkos(ulisttot_full,"sna:ulisttot_full",idxu_max,nelements,natom); - MemKK::realloc_kokkos(ulisttot_re_pack,"sna:ulisttot_re",1,1,1,1); - MemKK::realloc_kokkos(ulisttot_im_pack,"sna:ulisttot_im",1,1,1,1); - MemKK::realloc_kokkos(ulisttot_pack,"sna:ulisttot_pack",1,1,1,1); - MemKK::realloc_kokkos(ulist,"sna:ulist",idxu_cache_max,natom,nmax); - MemKK::realloc_kokkos(zlist,"sna:zlist",idxz_max,ndoubles,natom); - MemKK::realloc_kokkos(zlist_pack,"sna:zlist_pack",1,1,1,1); - MemKK::realloc_kokkos(blist,"sna:blist",natom,ntriples,idxb_max); - MemKK::realloc_kokkos(blist_pack,"sna:blist_pack",1,1,1,1); - MemKK::realloc_kokkos(ylist,"sna:ylist",idxu_half_max,nelements,natom); - MemKK::realloc_kokkos(ylist_pack_re,"sna:ylist_pack_re",1,1,1,1); - MemKK::realloc_kokkos(ylist_pack_im,"sna:ylist_pack_im",1,1,1,1); - MemKK::realloc_kokkos(dulist,"sna:dulist",idxu_cache_max,natom,nmax); - -#ifdef LMP_KOKKOS_GPU + MemKK::realloc_kokkos(a_gpu,"sna:a_gpu",1,1); + MemKK::realloc_kokkos(b_gpu,"sna:b_gpu",1,1); + MemKK::realloc_kokkos(da_gpu,"sna:da_gpu",1,1,1); + MemKK::realloc_kokkos(db_gpu,"sna:db_gpu",1,1,1); + MemKK::realloc_kokkos(sfac_gpu,"sna:sfac_gpu",1,1,1); + MemKK::realloc_kokkos(ulist_cpu,"sna:ulist_cpu", natom_pad, nmax, idxu_cache_max); + MemKK::realloc_kokkos(dulist_cpu,"sna:dulist_cpu", natom_pad, nmax, idxu_cache_max); } -#endif } /* ---------------------------------------------------------------------- @@ -375,9 +346,8 @@ void SNAKokkos::grow_rij(int newnatom, int template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_cayley_klein(const int& iatom_mod, const int& jnbor, const int& iatom_div) +void SNAKokkos::compute_cayley_klein(const int& iatom, const int& jnbor) const { - const int iatom = iatom_mod + vector_length * iatom_div; const real_type x = rij(iatom,jnbor,0); const real_type y = rij(iatom,jnbor,1); const real_type z = rij(iatom,jnbor,2); @@ -431,28 +401,28 @@ void SNAKokkos::compute_cayley_klein(const const real_type dsfacuy = dsfac * uy; const real_type dsfacuz = dsfac * uz; - a_pack(iatom_mod,jnbor,iatom_div) = a; - b_pack(iatom_mod,jnbor,iatom_div) = b; + a_gpu(iatom,jnbor) = a; + b_gpu(iatom,jnbor) = b; - da_pack(iatom_mod,jnbor,iatom_div,0) = dax; - db_pack(iatom_mod,jnbor,iatom_div,0) = dbx; + da_gpu(iatom,jnbor,0) = dax; + db_gpu(iatom,jnbor,0) = dbx; - da_pack(iatom_mod,jnbor,iatom_div,1) = day; - db_pack(iatom_mod,jnbor,iatom_div,1) = dby; + da_gpu(iatom,jnbor,1) = day; + db_gpu(iatom,jnbor,1) = dby; - da_pack(iatom_mod,jnbor,iatom_div,2) = daz; - db_pack(iatom_mod,jnbor,iatom_div,2) = dbz; + da_gpu(iatom,jnbor,2) = daz; + db_gpu(iatom,jnbor,2) = dbz; - sfac_pack(iatom_mod,jnbor,iatom_div,0) = sfac; - sfac_pack(iatom_mod,jnbor,iatom_div,1) = dsfacux; - sfac_pack(iatom_mod,jnbor,iatom_div,2) = dsfacuy; - sfac_pack(iatom_mod,jnbor,iatom_div,3) = dsfacuz; + sfac_gpu(iatom,jnbor,0) = sfac; + sfac_gpu(iatom,jnbor,1) = dsfacux; + sfac_gpu(iatom,jnbor,2) = dsfacuy; + sfac_gpu(iatom,jnbor,3) = dsfacuz; // we need to explicitly zero `dedr` somewhere before hitting // ComputeFusedDeidrj --- this is just a convenient place to do it. - dedr(iatom_mod + vector_length * iatom_div, jnbor, 0) = static_cast(0.); - dedr(iatom_mod + vector_length * iatom_div, jnbor, 1) = static_cast(0.); - dedr(iatom_mod + vector_length * iatom_div, jnbor, 2) = static_cast(0.); + dedr(iatom, jnbor, 0) = static_cast(0.); + dedr(iatom, jnbor, 1) = static_cast(0.); + dedr(iatom, jnbor, 2) = static_cast(0.); } @@ -464,9 +434,8 @@ void SNAKokkos::compute_cayley_klein(const template KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui(const int& iatom_mod, const int& j, const int& ielem, const int& iatom_div) +void SNAKokkos::pre_ui(const int& iatom, const int& j, const int& ielem) const { - for (int jelem = 0; jelem < nelements; jelem++) { int jju_half = idxu_half_block(j); @@ -479,14 +448,13 @@ void SNAKokkos::pre_ui(const int& iatom_mo real_type re_part = static_cast(0.); if (ma == mb && (!chem_flag || ielem == jelem || wselfall_flag)) { re_part = wself; } - ulisttot_re_pack(iatom_mod, jju_half, jelem, iatom_div) = re_part; - ulisttot_im_pack(iatom_mod, jju_half, jelem, iatom_div) = static_cast(0.); + ulisttot_re(iatom, jelem, jju_half) = re_part; + ulisttot_im(iatom, jelem, jju_half) = 0; jju_half++; } } } - } /* ---------------------------------------------------------------------- @@ -498,9 +466,9 @@ void SNAKokkos::pre_ui(const int& iatom_mo template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) +void SNAKokkos::compute_ui_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) const { - + const int iatom = iatom_mod + vector_length * iatom_div; // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer const int tile_size = vector_length * (twojmax + 1); @@ -512,25 +480,26 @@ void SNAKokkos::compute_ui_small(const typ const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters - const complex a = a_pack(iatom_mod, jnbor, iatom_div); - const complex b = b_pack(iatom_mod, jnbor, iatom_div); - const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); + const complex a = a_gpu(iatom, jnbor); + const complex b = b_gpu(iatom, jnbor); + const real_type sfac = sfac_gpu(iatom, jnbor, 0); - const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + const int jelem = element(iatom, jnbor); // we need to "choose" when to bend // this for loop is here for context --- we expose additional // parallelism over this loop instead //for (int j_bend = 0; j_bend <= twojmax; j_bend++) { - evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom_mod, j_bend, iatom_div); + evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom, j_bend); } // Version of the code that loops over all `j_bend` values which reduces integer arithmetic // and some amount of load imbalance, at the expense of reducing parallelism template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) +void SNAKokkos::compute_ui_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) const { + const int iatom = iatom_mod + vector_length * iatom_div; // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer const int tile_size = vector_length * (twojmax + 1); @@ -542,18 +511,18 @@ void SNAKokkos::compute_ui_large(const typ const WignerWrapper ulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters - const complex a = a_pack(iatom_mod, jnbor, iatom_div); - const complex b = b_pack(iatom_mod, jnbor, iatom_div); - const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); + const complex a = a_gpu(iatom, jnbor); + const complex b = b_gpu(iatom, jnbor); + const real_type sfac = sfac_gpu(iatom, jnbor, 0); - const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + const int jelem = element(iatom, jnbor); // we need to "choose" when to bend #ifdef LMP_KK_DEVICE_COMPILE #pragma unroll #endif for (int j_bend = 0; j_bend <= twojmax; j_bend++) { - evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom_mod, j_bend, iatom_div); + evaluate_ui_jbend(ulist_wrapper, a, b, sfac, jelem, iatom, j_bend); } } @@ -562,9 +531,8 @@ template KOKKOS_FORCEINLINE_FUNCTION void SNAKokkos::evaluate_ui_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const real_type& sfac, const int& jelem, - const int& iatom_mod, const int& j_bend, const int& iatom_div) + const int& iatom, const int& j_bend) const { - // utot(j,ma,mb) = 0 for all j,ma,ma // utot(j,ma,ma) = 1 for all j,ma // for j in neighbors of i: @@ -624,8 +592,8 @@ void SNAKokkos::evaluate_ui_jbend(const Wi const complex ulist_prev = ulist_wrapper.get(ma); // atomic add the previous level here - Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac); - Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac); + Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jjup + ma)), ulist_prev.re * sfac); + Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jjup + ma)), ulist_prev.im * sfac); // ulist_accum += rootpq * b * ulist_prev; real_type rootpq = rootpqarray(j - ma, mb); @@ -654,217 +622,209 @@ void SNAKokkos::evaluate_ui_jbend(const Wi const complex ulist_prev = ulist_wrapper.get(ma); // atomic add the previous level here - Kokkos::atomic_add(&(ulisttot_re_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.re * sfac); - Kokkos::atomic_add(&(ulisttot_im_pack(iatom_mod, jjup + ma, jelem, iatom_div)), ulist_prev.im * sfac); + Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jjup + ma)), ulist_prev.re * sfac); + Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jjup + ma)), ulist_prev.im * sfac); } - } /* ---------------------------------------------------------------------- - compute Zi by summing over products of Ui, - AoSoA data layout to take advantage of coalescing, avoiding warp - divergence. GPU version + compute Ui by summing over bispectrum components. CPU only. + This first computes Wigner U-functions for one neighbor. + `ulisttot` uses a "cached" data layout, matching the amount of + information stored between layers via scratch memory on the GPU path. + Next, it adds Wigner U-functions for each neighbor to ulisttot which is + in a "half" data layout, which is a compressed layout + which still keeps the recursive calculation simple. ------------------------------------------------------------------------- */ template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi(const int& iatom_mod, const int& jjz, const int& iatom_div) +template KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_ui_cpu(const int& iatom, const int& jnbor) const { + // utot(j,ma,mb) = 0 for all j,ma,ma + // utot(j,ma,ma) = 1 for all j,ma + // for j in neighbors of i: + // compute r0 = (x,y,z,z0) + // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb - int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg; - idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg); + const real_type x = rij(iatom,jnbor,0); + const real_type y = rij(iatom,jnbor,1); + const real_type z = rij(iatom,jnbor,2); + const real_type rsq = x * x + y * y + z * z; + const real_type r = sqrt(rsq); - const real_type* cgblock = cglist.data() + idxcg; + const real_type theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); + // theta0 = (r - rmin0) * rscale0; + const real_type z0 = r / tan(theta0); - int idouble = 0; + // begin what was "compute_uarray_cpu" - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { + // compute Cayley-Klein parameters for unit quaternion + real_type r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); + complex a = { r0inv * z0, -r0inv * z }; + complex b = { r0inv * y, -r0inv * x }; - zlist_pack(iatom_mod,jjz,idouble,iatom_div) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom_mod, elem1, elem2, iatom_div, cgblock); + // VMK Section 4.8.2 + ulist_cpu(iatom, jnbor, 0) = complex::one(); - idouble++; + for (int j = 1; j <= twojmax; j++) { + int jju = idxu_cache_block[j]; // removed "const" to work around GCC 7 bug + int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug + + // fill in left side of matrix layer from previous layer + for (int mb = 0; 2*mb <= j; mb++) { + int jju_index = jju + (j + 1) * mb; + int jjup_index = jjup + j * mb; + + complex ui = complex::zero(); + + for (int ma = 0; ma < j; ma++) { + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + + real_type rootpq = rootpqarray(j - ma, j - mb); + ui.re += rootpq * (a.re * ui_prev.re + a.im * ui_prev.im); + ui.im += rootpq * (a.re * ui_prev.im - a.im * ui_prev.re); + + ulist_cpu(iatom, jnbor, jju_index) = ui; + + rootpq = rootpqarray(ma + 1, j - mb); + ui.re = -rootpq * (b.re * ui_prev.re + b.im * ui_prev.im); + ui.im = -rootpq * (b.re * ui_prev.im - b.im * ui_prev.re); + + jju_index++; + jjup_index++; + } + + ulist_cpu(iatom, jnbor, jju_index) = ui; + } + + // If j is odd (half-integer in the mathematical convention), we need + // to add one more row for convenience (for now). This can either be done + // via symmetry (see the commented code below), or by the equations to fill + // from the left instead of the right + if (j % 2 == 1) { + int mb = j / 2; + // begin filling in the extra row + int jju_index = jju + (mb + 1) * (j + 1); + int jjup_index = jjup + mb * j; + + complex ui = complex::zero(); + + for (int ma = 0; ma < j; ma++) { + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + + real_type rootpq = rootpqarray(j - ma, mb + 1); + ui.re += rootpq * (b.re * ui_prev.re - b.im * ui_prev.im); + ui.im += rootpq * (b.re * ui_prev.im + b.im * ui_prev.re); + + ulist_cpu(iatom, jnbor, jju_index) = ui; + + rootpq = rootpqarray(ma + 1, mb + 1); + ui.re = rootpq * (a.re * ui_prev.re - a.im * ui_prev.im); + ui.im = rootpq * (a.re * ui_prev.im + a.im * ui_prev.re); + + jju_index++; + jjup_index++; + } + + ulist_cpu(iatom, jnbor, jju_index) = ui; + } + } + + // begin what was add_uarraytot + + const real_type sfac = compute_sfac(r, rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor)) * wj(iatom,jnbor); + const auto jelem = element(iatom, jnbor); + + for (int j = 0; j <= twojmax; j++) { + int jju_half = idxu_half_block[j]; // index into ulisttot + int jju_cache = idxu_cache_block[j]; // index into ulist + + int count = 0; + for (int mb = 0; 2*mb <= j; mb++) { + for (int ma = 0; ma <= j; ma++) { + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ulisttot_re(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).re); + Kokkos::atomic_add(&(ulisttot_im(iatom, jelem, jju_half+count)), sfac * ulist_cpu(iatom, jnbor, jju_cache+count).im); + } else { + ulisttot_re(iatom, jelem, jju_half+count) += sfac * ulist_cpu(iatom, jnbor, jju_cache+count).re; + ulisttot_im(iatom, jelem, jju_half+count) += sfac * ulist_cpu(iatom, jnbor, jju_cache+count).im; + } + count++; + } } } } /* ---------------------------------------------------------------------- - compute Bi by summing conj(Ui)*Zi - AoSoA data layout to take advantage of coalescing, avoiding warp - divergence. + De-symmetrize ulisttot_re and _im and pack it into a unified ulisttot + structure, fused in with zeroing ylist ------------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi(const int& iatom_mod, const int& jjb, const int& iatom_div) +void SNAKokkos::transform_ui(const int& iatom, const int& idxu) const { - // for j1 = 0,...,twojmax - // for j2 = 0,twojmax - // for j = |j1-j2|,Min(twojmax,j1+j2),2 - // b(j1,j2,j) = 0 - // for mb = 0,...,jmid - // for ma = 0,...,j - // b(j1,j2,j) += - // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) + int elem_count = chem_flag ? nelements : 1; - const int j1 = idxb(jjb,0); - const int j2 = idxb(jjb,1); - const int j = idxb(jjb,2); + for (int ielem = 0; ielem < elem_count; ielem++) { - const int jjz = idxz_block(j1,j2,j); - const int jju = idxu_block[j]; + const FullHalfMapper mapper = idxu_full_half[idxu]; - int itriple = 0; - int idouble = 0; - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - for (int elem3 = 0; elem3 < nelements; elem3++) { + auto utot_re = ulisttot_re(iatom, ielem, mapper.idxu_half); + auto utot_im = ulisttot_im(iatom, ielem, mapper.idxu_half); - double sumzu = 0.0; - double sumzu_temp = 0.0; + if (mapper.flip_sign == 1) { + utot_im = -utot_im; + } else if (mapper.flip_sign == -1) { + utot_re = -utot_re; + } - for (int mb = 0; 2*mb < j; mb++) { - for (int ma = 0; ma <= j; ma++) { - const int jju_index = jju+mb*(j+1)+ma; - const int jjz_index = jjz+mb*(j+1)+ma; - if (2*mb == j) return; // I think we can remove this? - const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); - const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); - sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; - } - } - sumzu += sumzu_temp; + ulisttot(iatom, ielem, idxu) = { utot_re, utot_im }; - // For j even, special treatment for middle column - if (j%2 == 0) { - sumzu_temp = 0.; - - const int mb = j/2; - for (int ma = 0; ma < mb; ma++) { - const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; - const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - - const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); - const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); - sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; - - } - sumzu += sumzu_temp; - - const int ma = mb; - const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; - const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - - const complex utot = ulisttot_pack(iatom_mod, jju_index, elem3, iatom_div); - const complex zloc = zlist_pack(iatom_mod, jjz_index, idouble, iatom_div); - sumzu += static_cast(0.5) * (utot.re * zloc.re + utot.im * zloc.im); - } // end if jeven - - sumzu *= static_cast(2.0); - if (bzero_flag) { - if (!wselfall_flag) { - if (elem1 == elem2 && elem1 == elem3) { - sumzu -= bzero[j]; - } - } else { - sumzu -= bzero[j]; - } - } - blist_pack(iatom_mod, jjb, itriple, iatom_div) = sumzu; - //} // end loop over j - //} // end loop over j1, j2 - itriple++; - } // end loop over elem3 - idouble++; - } // end loop over elem2 - } // end loop over elem1 + if (mapper.flip_sign == 0) { + ylist_re(iatom, ielem, mapper.idxu_half) = 0.; + ylist_im(iatom, ielem, mapper.idxu_half) = 0.; + } + } } - /* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices. - AoSoA data layout to take advantage of coalescing, avoiding warp - divergence. GPU version. + compute Zi by summing over products of Ui ------------------------------------------------------------------------- */ template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi(int iatom_mod, int jjz, int iatom_div, - const Kokkos::View &beta_pack) +template KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_zi(const int& iatom, const int& jjz) const { - - int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg; - idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg); + int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg; + idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg); const real_type *cgblock = cglist.data() + idxcg; - //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; - //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - - const complex ztmp = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom_mod, elem1, elem2, iatom_div, cgblock); - - // apply to z(j1,j2,j,ma,mb) to unique element of y(j) - // find right y_list[jju] and beta(iatom,jjb) entries - // multiply and divide by j+1 factors - // account for multiplicity of 1, 2, or 3 - - // pick out right beta value - for (int elem3 = 0; elem3 < nelements; elem3++) { - - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_pack); - - Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.re); - Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.im); - } // end loop over elem3 - } // end loop over elem2 - } // end loop over elem1 + if constexpr (chemsnap) { + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + zlist(iatom, idouble, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); + idouble++; + } // end loop over elem2 + } // end loop over elem1 + } else { + zlist(iatom, 0, jjz) = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, 0, 0, cgblock); + } } /* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices. - AoSoA data layout to take advantage of coalescing, avoiding warp - divergence. GPU version. + Core "evaluation" kernel that computes a single zlist value. + This gets used in both `compute_zi` and `compute_yi` ------------------------------------------------------------------------- */ -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi_with_zlist(int iatom_mod, int jjz, int iatom_div, - const Kokkos::View &beta_pack) -{ - int j1, j2, j, jju_half; - idxz(jjz).get_yi_with_zlist(j1, j2, j, jju_half); - - int idouble = 0; - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - const complex ztmp = zlist_pack(iatom_mod,jjz,idouble,iatom_div); - // apply to z(j1,j2,j,ma,mb) to unique element of y(j) - // find right y_list[jju] and beta(iatom,jjb) entries - // multiply and divide by j+1 factors - // account for multiplicity of 1, 2, or 3 - // pick out right beta value - for (int elem3 = 0; elem3 < nelements; elem3++) { - - const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom_mod, elem1, elem2, elem3, iatom_div, beta_pack); - - Kokkos::atomic_add(&(ylist_pack_re(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.re); - Kokkos::atomic_add(&(ylist_pack_im(iatom_mod, jju_half, elem3, iatom_div)), betaj * ztmp.im); - } // end loop over elem3 - idouble++; - } // end loop over elem2 - } // end loop over elem1 -} - -// Core "evaluation" kernel that computes a single zlist value -// which gets used in both `compute_zi` and `compute_yi` template KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::complex SNAKokkos::evaluate_zi(const int& j1, const int& j2, const int& j, const int& ma1min, const int& ma2max, const int& mb1min, const int& mb2max, const int& na, const int& nb, - const int& iatom_mod, const int& elem1, const int& elem2, const int& iatom_div, const real_type* cgblock) { - + const int& iatom, const int& elem1, const int& elem2, const real_type* cgblock) const { complex ztmp = complex::zero(); int jju1 = idxu_block[j1] + (j1+1)*mb1min; @@ -884,8 +844,8 @@ typename SNAKokkos::complex SNAKokkos::complex SNAKokkos -KOKKOS_FORCEINLINE_FUNCTION -typename SNAKokkos::real_type SNAKokkos::evaluate_beta_scaled(const int& j1, const int& j2, const int& j, - const int& iatom_mod, const int& elem1, const int& elem2, const int& elem3, const int& iatom_div, - const Kokkos::View &beta_pack) { +template KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_bi(const int& iatom, const int& jjb) const +{ + // for j1 = 0,...,twojmax + // for j2 = 0,twojmax + // for j = |j1-j2|,Min(twojmax,j1+j2),2 + // b(j1,j2,j) = 0 + // for mb = 0,...,jmid + // for ma = 0,...,j + // b(j1,j2,j) += + // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) - real_type betaj = 0; + const int j1 = idxb(jjb,0); + const int j2 = idxb(jjb,1); + const int j = idxb(jjb,2); - if (j >= j1) { - const int jjb = idxb_block(j1, j2, j); - const int itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; - if (j1 == j) { - if (j2 == j) betaj = static_cast(3) * beta_pack(iatom_mod, itriple, iatom_div); - else betaj = static_cast(2) * beta_pack(iatom_mod, itriple, iatom_div); - } else betaj = beta_pack(iatom_mod, itriple, iatom_div); - } else if (j >= j2) { - const int jjb = idxb_block(j, j2, j1); - const int itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; - if (j2 == j) betaj = static_cast(2) * beta_pack(iatom_mod, itriple, iatom_div); - else betaj = beta_pack(iatom_mod, itriple, iatom_div); + const int jjz = idxz_block(j1,j2,j); + const int jju = idxu_block[j]; + + if constexpr (chemsnap) { + int itriple = 0; + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + for (int elem3 = 0; elem3 < nelements; elem3++) { + blist(iatom, itriple, jjb) = evaluate_bi(j, jjz, jju, iatom, elem1, elem2, elem3); + itriple++; + } // end loop over elem3 + idouble++; + } // end loop over elem2 + } // end loop over elem1 } else { - const int jjb = idxb_block(j2, j, j1); - const int itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; - betaj = beta_pack(iatom_mod, itriple, iatom_div); + blist(iatom, 0, jjb) = evaluate_bi(j, jjz, jju, iatom, 0, 0, 0); } +} + +/* ---------------------------------------------------------------------- + Core "evaluation" kernel that computes a single blist value. + This gets used in `compute_bi` +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +real_type SNAKokkos::evaluate_bi(const int& j, const int& jjz, const int& jju, const int& iatom, const int& elem1, const int& elem2, const int& elem3) const +{ + // this computes the: + // b(j1,j2,j) = 0 + // for mb = 0,...,jmid + // for ma = 0,...,j + // b(j1,j2,j) += + // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) + // portion + + const int idouble = elem1 * nelements + elem2; + real_type sumzu = 0.0; + real_type sumzu_temp = 0.0; + + for (int mb = 0; 2*mb < j; mb++) { + for (int ma = 0; ma <= j; ma++) { + const int jju_index = jju+mb*(j+1)+ma; + const int jjz_index = jjz+mb*(j+1)+ma; + if (2*mb == j) return 0; // I think we can remove this? + const complex utot = ulisttot(iatom, elem3, jju_index); + const complex zloc = zlist(iatom, idouble, jjz_index); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + } + } + sumzu += sumzu_temp; + + // For j even, special treatment for middle column + if (j%2 == 0) { + sumzu_temp = 0.; + + const int mb = j/2; + for (int ma = 0; ma < mb; ma++) { + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const complex utot = ulisttot(iatom, elem3, jju_index); + const complex zloc = zlist(iatom, idouble, jjz_index); + sumzu_temp += utot.re * zloc.re + utot.im * zloc.im; + + } + sumzu += sumzu_temp; + + const int ma = mb; + const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; + const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; + + const complex utot = ulisttot(iatom, elem3, jju_index); + const complex zloc = zlist(iatom, idouble, jjz_index); + sumzu += static_cast(0.5) * (utot.re * zloc.re + utot.im * zloc.im); + } // end if jeven + + sumzu *= static_cast(2.0); + if (bzero_flag) { + if (!wselfall_flag) { + if (elem1 == elem2 && elem1 == elem3) { + sumzu -= bzero[j]; + } + } else { + sumzu -= bzero[j]; + } + } + return sumzu; + //} // end loop over j + //} // end loop over j1, j2 +} + + +/* ---------------------------------------------------------------------- + compute beta by either appropriately copying it from d_coeffi + or accumulating the quadratic terms from blist +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_beta_linear(const int& iatom, const int& idxb, const int& ielem) const +{ + auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); + + for (int itriple = 0; itriple < ntriples; itriple++) { + int icoeff = idxb + itriple * idxb_max; + d_beta(iatom, icoeff) = d_coeffi[icoeff+1]; + } +} + +template +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_beta_quadratic(const int& iatom, const int& idxb, const int& ielem) const +{ + auto d_coeffi = Kokkos::subview(d_coeffelem, ielem, Kokkos::ALL); + + // handle quadratic && chemflag as a special case + if (chem_flag) { + if (idxb == 0) { + // no need to use atomics, we're just serializing + int k = ncoeff+1; + for (int icoeff = 0; icoeff < ncoeff; icoeff++) { + const auto idxb = icoeff % idxb_max; + const auto idx_chem = icoeff / idxb_max; + real_type bveci = blist(iatom, idx_chem, idxb); + d_beta(iatom, icoeff) += d_coeffi[k] * bveci; + k++; + for (int jcoeff = icoeff+1; jcoeff < ncoeff; jcoeff++) { + const auto jdxb = jcoeff % idxb_max; + const auto jdx_chem = jcoeff / idxb_max; + real_type bvecj = blist(iatom, jdx_chem, jdxb); + d_beta(iatom, icoeff) += d_coeffi[k] * bvecj; + d_beta(iatom, jcoeff) += d_coeffi[k] * bveci; + k++; + } + } + } + } else { + // Compute triangular partial sum via a closed form to get the starting offset + int k = (idxb * (1 + 2 * idxb_max - idxb)) / 2 + idxb_max + 1; + real_type bveci = blist(iatom, 0, idxb); + + // Locally accumulate the contribution to d_beta(iatom, idxb) + real_type beta_idxb_accum = d_coeffi[k] * bveci; + k++; + + for (int jdxb = idxb + 1; jdxb < idxb_max; jdxb++) { + real_type bvecj = blist(iatom, 0, jdxb); + real_type coeff_k = d_coeffi[k]; + beta_idxb_accum += coeff_k * bvecj; + + // Accumulate "half" contribution into d_beta(iatom, jdxb) + if constexpr (need_atomics) + Kokkos::atomic_add(&d_beta(iatom, jdxb), coeff_k * bveci); + else + d_beta(iatom, jdxb) += coeff_k * bveci; + + k++; + } + + if constexpr (need_atomics) + Kokkos::atomic_add(&d_beta(iatom, idxb), beta_idxb_accum); + else + d_beta(iatom, idxb) += beta_idxb_accum; + } +} + + +/* ---------------------------------------------------------------------- + Compute Yi from Ui without storing Zi, looping over zlist indices. +------------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_yi(const int& iatom, const int& jjz) const +{ + int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg; + idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg); + + const real_type *cgblock = cglist.data() + idxcg; + //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; + //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; + + if constexpr (chemsnap) { + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + + const complex ztmp = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, elem1, elem2, cgblock); + + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); + + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj * ztmp.im); + } else { + ylist_re(iatom, elem3, jju_half) += betaj * ztmp.re; + ylist_im(iatom, elem3, jju_half) += betaj * ztmp.im; + } + } // end loop over elem3 + } // end loop over elem2 + } // end loop over elem1 + } else { + const complex ztmp = evaluate_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, iatom, 0, 0, cgblock); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, 0, 0, 0); + + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ylist_re(iatom, 0, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, 0, jju_half)), betaj * ztmp.im); + } else { + ylist_re(iatom, 0, jju_half) += betaj * ztmp.re; + ylist_im(iatom, 0, jju_half) += betaj * ztmp.im; + } + } +} + +/* ---------------------------------------------------------------------- + compute Yi from Ui with the precomputed Zi. +------------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void SNAKokkos::compute_yi_with_zlist(const int& iatom, const int& jjz) const +{ + int j1, j2, j, jju_half; + idxz(jjz).get_yi_with_zlist(j1, j2, j, jju_half); + + if constexpr (chemsnap) { + int idouble = 0; + for (int elem1 = 0; elem1 < nelements; elem1++) { + for (int elem2 = 0; elem2 < nelements; elem2++) { + const complex ztmp = zlist(iatom, idouble, jjz); + // apply to z(j1,j2,j,ma,mb) to unique element of y(j) + // find right y_list[jju] and beta(iatom,jjb) entries + // multiply and divide by j+1 factors + // account for multiplicity of 1, 2, or 3 + // pick out right beta value + for (int elem3 = 0; elem3 < nelements; elem3++) { + + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, elem1, elem2, elem3); + + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ylist_re(iatom, elem3, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, elem3, jju_half)), betaj * ztmp.im); + } else { + ylist_re(iatom, elem3, jju_half) += betaj * ztmp.re; + ylist_im(iatom, elem3, jju_half) += betaj * ztmp.im; + } + } // end loop over elem3 + idouble++; + } // end loop over elem2 + } // end loop over elem1 + } else { + const complex ztmp = zlist(iatom, 0, jjz); + const real_type betaj = evaluate_beta_scaled(j1, j2, j, iatom, 0, 0, 0); + + if constexpr (need_atomics) { + Kokkos::atomic_add(&(ylist_re(iatom, 0, jju_half)), betaj * ztmp.re); + Kokkos::atomic_add(&(ylist_im(iatom, 0, jju_half)), betaj * ztmp.im); + } else { + ylist_re(iatom, 0, jju_half) += betaj * ztmp.re; + ylist_im(iatom, 0, jju_half) += betaj * ztmp.im; + } + } +} + +/* ---------------------------------------------------------------------- + Core "evaluation" kernel that extracts and rescales the appropriate + `beta` value which gets used in both `compute_yi` and `compute_yi_from_zlist` +------------------------------------------------------------------------- */ + +template +template KOKKOS_FORCEINLINE_FUNCTION +typename SNAKokkos::real_type SNAKokkos::evaluate_beta_scaled(const int& j1, const int& j2, const int& j, + const int& iatom, const int& elem1, const int& elem2, const int& elem3) const { + + int itriple_jjb = 0; + real_type factor = 0; + + if constexpr (chemsnap) { + if (j >= j1) { + itriple_jjb = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + idxb_block(j1, j2, j); + if (j1 == j) { + if (j2 == j) factor = 3; + else factor = 2; + } else factor = 1; + } else if (j >= j2) { + itriple_jjb = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + idxb_block(j, j2, j1); + if (j2 == j) factor = 2; + else factor = 1; + } else { + itriple_jjb = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + idxb_block(j2, j, j1); + factor = 1; + } + } else { + if (j >= j1) { + itriple_jjb = idxb_block(j1, j2, j); + if (j1 == j) { + if (j2 == j) factor = 3; + else factor = 2; + } else factor = 1; + } else if (j >= j2) { + itriple_jjb = idxb_block(j, j2, j1); + if (j2 == j) factor = 2; + else factor = 1; + } else { + itriple_jjb = idxb_block(j2, j, j1); + factor = 1; + } + } + + real_type betaj = factor * d_beta(iatom, itriple_jjb); if (!bnorm_flag && j1 > j) { const real_type scale = static_cast(j1 + 1) / static_cast(j + 1); @@ -955,8 +1234,9 @@ typename SNAKokkos::real_type SNAKokkos template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) +void SNAKokkos::compute_fused_deidrj_small(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int j_bend, const int jnbor, const int iatom_div) const { + const int iatom = iatom_mod + vector_length * iatom_div; // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer const int tile_size = vector_length * (twojmax + 1); @@ -969,21 +1249,21 @@ void SNAKokkos::compute_fused_deidrj_small WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters - const complex a = a_pack(iatom_mod, jnbor, iatom_div); - const complex b = b_pack(iatom_mod, jnbor, iatom_div); - const complex da = da_pack(iatom_mod, jnbor, iatom_div, dir); - const complex db = db_pack(iatom_mod, jnbor, iatom_div, dir); - const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); - const real_type dsfacu = sfac_pack(iatom_mod, jnbor, iatom_div, dir + 1); // dsfac * u + const complex a = a_gpu(iatom, jnbor); + const complex b = b_gpu(iatom, jnbor); + const complex da = da_gpu(iatom, jnbor, dir); + const complex db = db_gpu(iatom, jnbor, dir); + const real_type sfac = sfac_gpu(iatom, jnbor, 0); + const real_type dsfacu = sfac_gpu(iatom, jnbor, dir + 1); // dsfac * u - const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + const int jelem = element(iatom, jnbor); // compute the contribution to dedr_full_sum for one "bend" location const real_type dedr_full_sum = evaluate_duidrj_jbend(ulist_wrapper, a, b, sfac, dulist_wrapper, da, db, dsfacu, - jelem, iatom_mod, j_bend, iatom_div); + jelem, iatom, j_bend); // dedr gets zeroed out at the start of each iteration in compute_cayley_klein - Kokkos::atomic_add(&(dedr(iatom_mod + vector_length * iatom_div, jnbor, dir)), static_cast(2.0) * dedr_full_sum); + Kokkos::atomic_add(&(dedr(iatom, jnbor, dir)), static_cast(2.0) * dedr_full_sum); } @@ -992,8 +1272,9 @@ void SNAKokkos::compute_fused_deidrj_small template template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) +void SNAKokkos::compute_fused_deidrj_large(const typename Kokkos::TeamPolicy::member_type& team, const int iatom_mod, const int jnbor, const int iatom_div) const { + const int iatom = iatom_mod + vector_length * iatom_div; // get shared memory offset // scratch size: 32 atoms * (twojmax+1) cached values, no double buffer const int tile_size = vector_length * (twojmax + 1); @@ -1006,14 +1287,14 @@ void SNAKokkos::compute_fused_deidrj_large WignerWrapper dulist_wrapper((complex*)team.team_shmem().get_shmem(team.team_size() * tile_size * sizeof(complex), 0) + scratch_shift, iatom_mod); // load parameters - const complex a = a_pack(iatom_mod, jnbor, iatom_div); - const complex b = b_pack(iatom_mod, jnbor, iatom_div); - const complex da = da_pack(iatom_mod, jnbor, iatom_div, dir); - const complex db = db_pack(iatom_mod, jnbor, iatom_div, dir); - const real_type sfac = sfac_pack(iatom_mod, jnbor, iatom_div, 0); - const real_type dsfacu = sfac_pack(iatom_mod, jnbor, iatom_div, dir + 1); // dsfac * u + const complex a = a_gpu(iatom, jnbor); + const complex b = b_gpu(iatom, jnbor); + const complex da = da_gpu(iatom, jnbor, dir); + const complex db = db_gpu(iatom, jnbor, dir); + const real_type sfac = sfac_gpu(iatom, jnbor, 0); + const real_type dsfacu = sfac_gpu(iatom, jnbor, dir + 1); // dsfac * u - const int jelem = element(iatom_mod + vector_length * iatom_div, jnbor); + const int jelem = element(iatom, jnbor); // compute the contributions to dedr_full_sum for all "bend" locations real_type dedr_full_sum = static_cast(0); @@ -1022,11 +1303,11 @@ void SNAKokkos::compute_fused_deidrj_large #endif for (int j_bend = 0; j_bend <= twojmax; j_bend++) { dedr_full_sum += evaluate_duidrj_jbend(ulist_wrapper, a, b, sfac, dulist_wrapper, da, db, dsfacu, - jelem, iatom_mod, j_bend, iatom_div); + jelem, iatom,j_bend); } // there's one thread per atom, neighbor pair, so no need to make this atomic - dedr(iatom_mod + vector_length * iatom_div, jnbor, dir) = static_cast(2.0) * dedr_full_sum; + dedr(iatom, jnbor, dir) = static_cast(2.0) * dedr_full_sum; } @@ -1036,7 +1317,7 @@ template KOKKOS_FORCEINLINE_FUNCTION typename SNAKokkos::real_type SNAKokkos::evaluate_duidrj_jbend(const WignerWrapper& ulist_wrapper, const complex& a, const complex& b, const real_type& sfac, const WignerWrapper& dulist_wrapper, const complex& da, const complex& db, const real_type& dsfacu, - const int& jelem, const int& iatom_mod, const int& j_bend, const int& iatom_div) { + const int& jelem, const int& iatom, const int& j_bend) const { real_type dedr_full_sum = static_cast(0); @@ -1106,7 +1387,7 @@ typename SNAKokkos::real_type SNAKokkos::real_type SNAKokkos(0.5)*y_local; } else if (ma > (mb-1)) { y_local.re = static_cast(0.); y_local.im = static_cast(0.); } // can probably avoid this outright @@ -1171,335 +1452,6 @@ typename SNAKokkos::real_type SNAKokkos -KOKKOS_INLINE_FUNCTION -void SNAKokkos::pre_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, const int& iatom, const int& ielem) -{ - for (int jelem = 0; jelem < nelements; jelem++) { - for (int j = 0; j <= twojmax; j++) { - int jju = idxu_half_block(j); // removed "const" to work around GCC 7 bug - - // Only diagonal elements get initialized - // for (int m = 0; m < (j+1)*(j/2+1); m++) - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team, (j+1)*(j/2+1)), - [&] (const int m) { - - const int jjup = jju + m; - - // if m is on the "diagonal", initialize it with the self energy. - // Otherwise zero it out - complex init(static_cast(0.),static_cast(0.)); - if (m % (j+2) == 0 && (!chem_flag || ielem == jelem || wselfall_flag)) { init.re = wself; } //need to map iatom to element - - ulisttot(jjup, jelem, iatom) = init; - }); - } - } - -} - - -/* ---------------------------------------------------------------------- - compute Ui by summing over bispectrum components. CPU only. - See comments above compute_uarray_cpu and add_uarraytot for - data layout comments. -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_ui_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) -{ - real_type rsq, r, x, y, z, z0, theta0; - - // utot(j,ma,mb) = 0 for all j,ma,ma - // utot(j,ma,ma) = 1 for all j,ma - // for j in neighbors of i: - // compute r0 = (x,y,z,z0) - // utot(j,ma,mb) += u(r0;j,ma,mb) for all j,ma,mb - - x = rij(iatom,jnbor,0); - y = rij(iatom,jnbor,1); - z = rij(iatom,jnbor,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - - theta0 = (r - rmin0) * rfac0 * MY_PI / (rcutij(iatom,jnbor) - rmin0); - // theta0 = (r - rmin0) * rscale0; - z0 = r / tan(theta0); - - compute_uarray_cpu(team, iatom, jnbor, x, y, z, z0, r); - add_uarraytot(team, iatom, jnbor, r, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), element(iatom, jnbor)); - -} -/* ---------------------------------------------------------------------- - compute Zi by summing over products of Ui, CPU version -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_zi_cpu(const int& iter) -{ - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; - - int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg; - idxz(jjz).get_zi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, idxcg); - - const real_type *cgblock = cglist.data() + idxcg; - - int idouble = 0; - - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - zlist(jjz, idouble, iatom).re = static_cast(0.0); - zlist(jjz, idouble, iatom).im = static_cast(0.0); - - int jju1 = idxu_block[j1] + (j1+1)*mb1min; - int jju2 = idxu_block[j2] + (j2+1)*mb2max; - int icgb = mb1min*(j2+1) + mb2max; - for (int ib = 0; ib < nb; ib++) { - - real_type suma1_r = static_cast(0.0); - real_type suma1_i = static_cast(0.0); - - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min * (j2 + 1) + ma2max; - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re - - ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im); - suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im + - ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia - - zlist(jjz, idouble, iatom).re += cgblock[icgb] * suma1_r; - zlist(jjz, idouble, iatom).im += cgblock[icgb] * suma1_i; - - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; - } // end loop over ib - - if (bnorm_flag) { - const real_type scale = static_cast(1) / static_cast(j + 1); - zlist(jjz, idouble, iatom).re *= scale; - zlist(jjz, idouble, iatom).im *= scale; - } - idouble++; - } // end loop over elem2 - } // end loop over elem1 -} - - -/* ---------------------------------------------------------------------- - compute Bi by summing conj(Ui)*Zi, CPU version -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_bi_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom) -{ - // for j1 = 0,...,twojmax - // for j2 = 0,twojmax - // for j = |j1-j2|,Min(twojmax,j1+j2),2 - // b(j1,j2,j) = 0 - // for mb = 0,...,jmid - // for ma = 0,...,j - // b(j1,j2,j) += - // 2*Conj(u(j,ma,mb))*z(j1,j2,j,ma,mb) - - int itriple = 0; - int idouble = 0; - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - int jalloy = idouble; // must be non-const to work around gcc compiler bug - for (int elem3 = 0; elem3 < nelements; elem3++) { - Kokkos::parallel_for(Kokkos::TeamThreadRange(team,idxb_max), - [&] (const int& jjb) { - const int j1 = idxb(jjb, 0); - const int j2 = idxb(jjb, 1); - int j = idxb(jjb, 2); // removed "const" to work around GCC 7 bug - - int jjz = idxz_block(j1, j2, j); - int jju = idxu_block[j]; - real_type sumzu = static_cast(0.0); - real_type sumzu_temp = static_cast(0.0); - const int bound = (j+2)/2; - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,(j+1)*bound), - [&] (const int mbma, real_type& sum) { - const int ma = mbma % (j + 1); - const int mb = mbma / (j + 1); - const int jju_index = jju + mb * (j + 1) + ma; - const int jjz_index = jjz + mb * (j + 1) + ma; - if (2*mb == j) return; - sum += - ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im; - },sumzu_temp); // end loop over ma, mb - sumzu += sumzu_temp; - - // For j even, special treatment for middle column - - if (j%2 == 0) { - int mb = j/2; // removed "const" to work around GCC 7 bug - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, mb), - [&] (const int ma, real_type& sum) { - const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; - const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - sum += - ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im; - },sumzu_temp); // end loop over ma - sumzu += sumzu_temp; - - const int ma = mb; - const int jju_index = jju+(mb-1)*(j+1)+(j+1)+ma; - const int jjz_index = jjz+(mb-1)*(j+1)+(j+1)+ma; - sumzu += static_cast(0.5)* - (ulisttot_full(jju_index, elem3, iatom).re * zlist(jjz_index, jalloy, iatom).re + - ulisttot_full(jju_index, elem3, iatom).im * zlist(jjz_index, jalloy, iatom).im); - } // end if jeven - - Kokkos::single(Kokkos::PerThread(team), [&] () { - sumzu *= static_cast(2.0); - - // apply bzero shift - - if (bzero_flag) { - if (!wselfall_flag) { - if (elem1 == elem2 && elem1 == elem3) { - sumzu -= bzero[j]; - } - } else { - sumzu -= bzero[j]; - } - } - - blist(iatom, itriple, jjb) = sumzu; - }); - }); - //} // end loop over j - //} // end loop over j1, j2 - itriple++; - } - idouble++; - } // end loop over elem2 - } // end loop over elem1 - -} - -/* ---------------------------------------------------------------------- - compute Yi from Ui without storing Zi, looping over zlist indices, - CPU version -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_yi_cpu(int iter, - const Kokkos::View &beta) -{ - real_type betaj; - const int iatom = iter / idxz_max; - const int jjz = iter % idxz_max; - - int j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg; - idxz(jjz).get_yi(j1, j2, j, ma1min, ma2max, mb1min, mb2max, na, nb, jju_half, idxcg); - - const real_type *cgblock = cglist.data() + idxcg; - //int mb = (2 * (mb1min+mb2max) - j1 - j2 + j) / 2; - //int ma = (2 * (ma1min+ma2max) - j1 - j2 + j) / 2; - - for (int elem1 = 0; elem1 < nelements; elem1++) { - for (int elem2 = 0; elem2 < nelements; elem2++) { - - real_type ztmp_r = 0.0; - real_type ztmp_i = 0.0; - - int jju1 = idxu_block[j1] + (j1 + 1) * mb1min; - int jju2 = idxu_block[j2] + (j2 + 1) * mb2max; - int icgb = mb1min * (j2 +1) + mb2max; - - for (int ib = 0; ib < nb; ib++) { - - real_type suma1_r = 0.0; - real_type suma1_i = 0.0; - - int ma1 = ma1min; - int ma2 = ma2max; - int icga = ma1min*(j2+1) + ma2max; - - for (int ia = 0; ia < na; ia++) { - suma1_r += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).re - - ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).im); - suma1_i += cgblock[icga] * (ulisttot_full(jju1+ma1, elem1, iatom).re * ulisttot_full(jju2+ma2, elem2, iatom).im + - ulisttot_full(jju1+ma1, elem1, iatom).im * ulisttot_full(jju2+ma2, elem2, iatom).re); - ma1++; - ma2--; - icga += j2; - } // end loop over ia - - ztmp_r += cgblock[icgb] * suma1_r; - ztmp_i += cgblock[icgb] * suma1_i; - jju1 += j1 + 1; - jju2 -= j2 + 1; - icgb += j2; - } // end loop over ib - - if (bnorm_flag) { - const real_type scale = static_cast(1) / static_cast(j + 1); - ztmp_i *= scale; - ztmp_r *= scale; - } - - // apply to z(j1,j2,j,ma,mb) to unique element of y(j) - // find right y_list[jju] and beta(iatom,jjb) entries - // multiply and divide by j+1 factors - // account for multiplicity of 1, 2, or 3 - - // pick out right beta value - for (int elem3 = 0; elem3 < nelements; elem3++) { - - if (j >= j1) { - const int jjb = idxb_block(j1, j2, j); - const int itriple = ((elem1 * nelements + elem2) * nelements + elem3) * idxb_max + jjb; - if (j1 == j) { - if (j2 == j) betaj = 3 * beta(itriple, iatom); - else betaj = 2 * beta(itriple, iatom); - } else betaj = beta(itriple, iatom); - } else if (j >= j2) { - const int jjb = idxb_block(j, j2, j1); - const int itriple = ((elem3 * nelements + elem2) * nelements + elem1) * idxb_max + jjb; - if (j2 == j) betaj = 2 * beta(itriple, iatom); - else betaj = beta(itriple, iatom); - } else { - const int jjb = idxb_block(j2, j, j1); - const int itriple = ((elem2 * nelements + elem3) * nelements + elem1) * idxb_max + jjb; - betaj = beta(itriple, iatom); - } - - if (!bnorm_flag && j1 > j) - betaj *= static_cast(j1 + 1) / static_cast(j + 1); - - Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).re), betaj*ztmp_r); - Kokkos::atomic_add(&(ylist(jju_half, elem3, iatom).im), betaj*ztmp_i); - } // end loop over elem3 - } // end loop over elem2 - } // end loop over elem1 -} - - /* ---------------------------------------------------------------------- calculate derivative of Ui w.r.t. atom j see comments above compute_duarray_cpu for comments on the @@ -1508,27 +1460,179 @@ void SNAKokkos::compute_yi_cpu(int iter, template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_duidrj_cpu(const int& iatom, const int& jnbor) const { - real_type rsq, r, x, y, z, z0, theta0, cs, sn; - real_type dz0dr; + complex da[3], db[3]; + real_type u[3], dz0[3], dr0inv[3]; - x = rij(iatom,jnbor,0); - y = rij(iatom,jnbor,1); - z = rij(iatom,jnbor,2); - rsq = x * x + y * y + z * z; - r = sqrt(rsq); - real_type rscale0 = rfac0 * static_cast(MY_PI) / (rcutij(iatom,jnbor) - rmin0); - theta0 = (r - rmin0) * rscale0; - sn = sin(theta0); - cs = cos(theta0); - z0 = r * cs / sn; - dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; + const real_type x = rij(iatom,jnbor,0); + const real_type y = rij(iatom,jnbor,1); + const real_type z = rij(iatom,jnbor,2); + const real_type rsq = x * x + y * y + z * z; + const real_type r = sqrt(rsq); + const real_type rscale0 = rfac0 * static_cast(MY_PI) / (rcutij(iatom,jnbor) - rmin0); + const real_type theta0 = (r - rmin0) * rscale0; + const real_type sn = sin(theta0); + const real_type cs = cos(theta0); + const real_type z0 = r * cs / sn; + const real_type dz0dr = z0 / r - (r*rscale0) * (rsq + z0 * z0) / rsq; - compute_duarray_cpu(team, iatom, jnbor, x, y, z, z0, r, dz0dr, wj(iatom,jnbor), rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor)); + // begin what was compute_duarray_cpu + + real_type rinv = 1.0 / r; + u[0] = x * rinv; + u[1] = y * rinv; + u[2] = z * rinv; + + real_type r0inv = 1.0 / sqrt(r * r + z0 * z0); + complex a = { z0 * r0inv, -z * r0inv }; + complex b = { y * r0inv, -x * r0inv }; + + real_type dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); + + dr0inv[0] = dr0invdr * u[0]; + dr0inv[1] = dr0invdr * u[1]; + dr0inv[2] = dr0invdr * u[2]; + + dz0[0] = dz0dr * u[0]; + dz0[1] = dz0dr * u[1]; + dz0[2] = dz0dr * u[2]; + + for (int k = 0; k < 3; k++) { + da[k].re = dz0[k] * r0inv + z0 * dr0inv[k]; + da[k].im = -z * dr0inv[k]; + } + + da[2].im += -r0inv; + + for (int k = 0; k < 3; k++) { + db[k].re = y * dr0inv[k]; + db[k].im = -x * dr0inv[k]; + } + + db[0].im += -r0inv; + db[1].re += r0inv; + + for (int k = 0; k < 3; k++) + dulist_cpu(iatom, jnbor, 0, k) = complex::zero(); + + for (int j = 1; j <= twojmax; j++) { + int jju = idxu_cache_block[j]; + int jjup = idxu_cache_block[j-1]; + + for (int mb = 0; 2*mb <= j; mb++) { + int jju_index = jju + mb * (j + 1); + int jjup_index = jjup + mb * j; + + complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() }; + + for (int ma = 0; ma < j; ma++) { + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0), + dulist_cpu(iatom, jnbor, jjup_index, 1), + dulist_cpu(iatom, jnbor, jjup_index, 2) }; + + real_type rootpq = rootpqarray(j - ma,j - mb); + for (int k = 0; k < 3; k++) { + duidrj[k].re += rootpq * (da[k].re * ui_prev.re + da[k].im * ui_prev.im + + a.re * duidrj_prev[k].re + a.im * duidrj_prev[k].im); + + duidrj[k].im += rootpq * (da[k].re * ui_prev.im - da[k].im * ui_prev.re + + a.re * duidrj_prev[k].im - a.im * duidrj_prev[k].re); + + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; + } + + rootpq = rootpqarray(ma + 1,j - mb); + for (int k = 0; k < 3; k++) { + duidrj[k].re = -rootpq * (db[k].re * ui_prev.re + db[k].im * ui_prev.im + + b.re * duidrj_prev[k].re + b.im * duidrj_prev[k].im); + + duidrj[k].im = -rootpq * (db[k].re * ui_prev.im - db[k].im * ui_prev.re + + b.re * duidrj_prev[k].im - b.im * duidrj_prev[k].re); + } + + jju_index++; + jjup_index++; + } + + for (int k = 0; k < 3; k++) { + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; + } + } + + // Only need to add one symmetrized row for convenience + // Symmetry gets "unfolded" during the dedr accumulation + + // copy left side to right side with inversion symmetry VMK 4.4(2) + // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) + + if (j % 2 == 1) { + int mb = j / 2; + // begin filling in the extra row + int jju_index = jju + (mb + 1) * (j + 1); + int jjup_index = jjup + mb * j; + + complex duidrj[3] = { complex::zero(), complex::zero(), complex::zero() }; + + for (int ma = 0; ma < j; ma++) { + complex ui_prev = ulist_cpu(iatom, jnbor, jjup_index); + complex duidrj_prev[3] = { dulist_cpu(iatom, jnbor, jjup_index, 0), + dulist_cpu(iatom, jnbor, jjup_index, 1), + dulist_cpu(iatom, jnbor, jjup_index, 2) }; + + real_type rootpq = rootpqarray(j - ma, mb + 1); + for (int k = 0; k < 3; k++) { + duidrj[k].re += rootpq * (db[k].re * ui_prev.re - db[k].im * ui_prev.im + + b.re * duidrj_prev[k].re - b.im * duidrj_prev[k].im); + duidrj[k].im += rootpq * (db[k].re * ui_prev.im + db[k].im * ui_prev.re + + b.re * duidrj_prev[k].im + b.im * duidrj_prev[k].re); + + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; + } + + rootpq = rootpqarray(ma + 1, mb + 1); + for (int k = 0; k < 3; k++) { + duidrj[k].re = rootpq * (da[k].re * ui_prev.re - da[k].im * ui_prev.im + + a.re * duidrj_prev[k].re - a.im * duidrj_prev[k].im); + duidrj[k].im = rootpq * (da[k].re * ui_prev.im + da[k].im * ui_prev.re + + a.re * duidrj_prev[k].im + a.im * duidrj_prev[k].re); + } + + jju_index++; + jjup_index++; + } + + for (int k = 0; k < 3; k++) { + dulist_cpu(iatom, jnbor, jju_index, k) = duidrj[k]; + } + } + } + + real_type sfac, dsfac; + compute_s_dsfac(r, rcutij(iatom,jnbor), sinnerij(iatom,jnbor), dinnerij(iatom,jnbor), sfac, dsfac); + + sfac *= wj(iatom,jnbor); + dsfac *= wj(iatom,jnbor); + + // Even though we fill out a full "cached" data layout above, + // we only need the "half" data for the accumulation into dedr. + // Thus we skip updating any unnecessary data. + for (int j = 0; j <= twojmax; j++) { + int jju = idxu_cache_block[j]; + for (int mb = 0; 2*mb <= j; mb++) + for (int ma = 0; ma <= j; ma++) { + for (int k = 0; k < 3; k++) { + dulist_cpu(iatom, jnbor, jju, k).re = dsfac * ulist_cpu(iatom, jnbor, jju).re * u[k] + + sfac * dulist_cpu(iatom, jnbor, jju, k).re; + dulist_cpu(iatom, jnbor, jju, k).im = dsfac * ulist_cpu(iatom, jnbor, jju).im * u[k] + + sfac * dulist_cpu(iatom, jnbor, jju, k).im; + } + jju++; + } + } } - /* ---------------------------------------------------------------------- compute dEidRj, CPU path only. dulist takes advantage of a `cached` data layout, similar to the @@ -1537,356 +1641,44 @@ void SNAKokkos::compute_duidrj_cpu(const t dulist only uses the "half" data layout part of that structure. ------------------------------------------------------------------------- */ - template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_deidrj_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor) +void SNAKokkos::compute_deidrj_cpu(const int& iatom, const int& jnbor) const { - t_scalar3 final_sum; + real_type force_sum[3] = { 0, 0, 0 }; const int jelem = element(iatom, jnbor); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j, t_scalar3& sum_tmp) { + for (int j = 0; j <= twojmax; j++) { int jju_half = idxu_half_block[j]; int jju_cache = idxu_cache_block[j]; - for (int mb = 0; 2*mb < j; mb++) + for (int mb = 0; 2 * mb < j; mb++) { for (int ma = 0; ma <= j; ma++) { - sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im; - sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im; - sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im; + complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; + for (int k = 0; k < 3; k++) + force_sum[k] += dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re + + dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im; jju_half++; jju_cache++; - } //end loop over ma mb + } + } //end loop over ma mb // For j even, handle middle column - if (j%2 == 0) { - - int mb = j/2; - for (int ma = 0; ma < mb; ma++) { - sum_tmp.x += dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im; - sum_tmp.y += dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im; - sum_tmp.z += dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im; + if (j % 2 == 0) { + //int mb = j / 2; + for (int ma = 0; ma <= j; ma++) { + complex y = { ylist_re(iatom, jelem, jju_half), ylist_im(iatom, jelem, jju_half) }; + for (int k = 0; k < 3; k++) + force_sum[k] += static_cast(0.5) * (dulist_cpu(iatom, jnbor, jju_cache, k).re * y.re + + dulist_cpu(iatom, jnbor, jju_cache, k).im * y.im); jju_half++; jju_cache++; } - - //int ma = mb; - sum_tmp.x += (dulist(jju_cache,iatom,jnbor,0).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,0).im * ylist(jju_half,jelem,iatom).im)*0.5; - sum_tmp.y += (dulist(jju_cache,iatom,jnbor,1).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,1).im * ylist(jju_half,jelem,iatom).im)*0.5; - sum_tmp.z += (dulist(jju_cache,iatom,jnbor,2).re * ylist(jju_half,jelem,iatom).re + - dulist(jju_cache,iatom,jnbor,2).im * ylist(jju_half,jelem,iatom).im)*0.5; } // end if jeven - },final_sum); // end loop over j - - Kokkos::single(Kokkos::PerThread(team), [&] () { - dedr(iatom,jnbor,0) = final_sum.x*2.0; - dedr(iatom,jnbor,1) = final_sum.y*2.0; - dedr(iatom,jnbor,2) = final_sum.z*2.0; - }); - -} - - -/* ---------------------------------------------------------------------- - add Wigner U-functions for one neighbor to the total - ulist is in a "cached" data layout, which is a compressed layout - which still keeps the recursive calculation simple. On the other hand - `ulisttot` uses a "half" data layout, which fully takes advantage - of the symmetry of the Wigner U matrices. -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::add_uarraytot(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, - const real_type& r, const real_type& wj, const real_type& rcut, - const real_type& sinner, const real_type& dinner, int jelem) -{ - const real_type sfac = compute_sfac(r, rcut, sinner, dinner) * wj; - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,twojmax+1), - [&] (const int& j) { - - int jju_half = idxu_half_block[j]; // index into ulisttot - int jju_cache = idxu_cache_block[j]; // index into ulist - - int count = 0; - for (int mb = 0; 2*mb <= j; mb++) { - for (int ma = 0; ma <= j; ma++) { - Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).re), sfac * ulist(jju_cache+count, iatom, jnbor).re); - Kokkos::atomic_add(&(ulisttot(jju_half+count, jelem, iatom).im), sfac * ulist(jju_cache+count, iatom, jnbor).im); - count++; - } - } - }); -} - -/* ---------------------------------------------------------------------- - compute Wigner U-functions for one neighbor. - `ulisttot` uses a "cached" data layout, matching the amount of - information stored between layers via scratch memory on the GPU path -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_uarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, - const real_type& x, const real_type& y, const real_type& z, const real_type& z0, const real_type& r) -{ - real_type r0inv; - real_type a_r, b_r, a_i, b_i; - real_type rootpq; - - // compute Cayley-Klein parameters for unit quaternion - - r0inv = static_cast(1.0) / sqrt(r * r + z0 * z0); - a_r = r0inv * z0; - a_i = -r0inv * z; - b_r = r0inv * y; - b_i = -r0inv * x; - - // VMK Section 4.8.2 - - ulist(0,iatom,jnbor).re = 1.0; - ulist(0,iatom,jnbor).im = 0.0; - - for (int j = 1; j <= twojmax; j++) { - int jju = idxu_cache_block[j]; // removed "const" to work around GCC 7 bug - int jjup = idxu_cache_block[j-1]; // removed "const" to work around GCC 7 bug - - // fill in left side of matrix layer from previous layer - - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), - [&] (const int& mb) { - //for (int mb = 0; 2*mb <= j; mb++) { - const int jju_index = jju+mb+mb*j; - ulist(jju_index,iatom,jnbor).re = 0.0; - ulist(jju_index,iatom,jnbor).im = 0.0; - - for (int ma = 0; ma < j; ma++) { - const int jju_index = jju+mb+mb*j+ma; - const int jjup_index = jjup+mb*j+ma; - rootpq = rootpqarray(j - ma,j - mb); - ulist(jju_index,iatom,jnbor).re += - rootpq * - (a_r * ulist(jjup_index,iatom,jnbor).re + - a_i * ulist(jjup_index,iatom,jnbor).im); - ulist(jju_index,iatom,jnbor).im += - rootpq * - (a_r * ulist(jjup_index,iatom,jnbor).im - - a_i * ulist(jjup_index,iatom,jnbor).re); - - rootpq = rootpqarray(ma + 1,j - mb); - ulist(jju_index+1,iatom,jnbor).re = - -rootpq * - (b_r * ulist(jjup_index,iatom,jnbor).re + - b_i * ulist(jjup_index,iatom,jnbor).im); - ulist(jju_index+1,iatom,jnbor).im = - -rootpq * - (b_r * ulist(jjup_index,iatom,jnbor).im - - b_i * ulist(jjup_index,iatom,jnbor).re); - } - - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j,mb-j] = (-1)^(ma-mb)*Conj([u[ma,mb)) - - // Only need to add one symmetrized row for convenience - // Symmetry gets "unfolded" in accumulating ulisttot - if (j%2==1 && mb==(j/2)) { - const int mbpar = (mb)%2==0?1:-1; - int mapar = mbpar; - for (int ma = 0; ma <= j; ma++) { - const int jju_index = jju + mb*(j+1) + ma; - const int jjup_index = jju + (j+1-mb)*(j+1)-(ma+1); - if (mapar == 1) { - ulist(jjup_index,iatom,jnbor).re = ulist(jju_index,iatom,jnbor).re; - ulist(jjup_index,iatom,jnbor).im = -ulist(jju_index,iatom,jnbor).im; - } else { - ulist(jjup_index,iatom,jnbor).re = -ulist(jju_index,iatom,jnbor).re; - ulist(jjup_index,iatom,jnbor).im = ulist(jju_index,iatom,jnbor).im; - } - mapar = -mapar; - } - } - }); - - } -} - -/* ---------------------------------------------------------------------- - compute derivatives of Wigner U-functions for one neighbor - see comments in compute_uarray_cpu() - Uses same cached data layout of ulist -------------------------------------------------------------------------- */ - -template -KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_duarray_cpu(const typename Kokkos::TeamPolicy::member_type& team, int iatom, int jnbor, - const real_type& x, const real_type& y, const real_type& z, - const real_type& z0, const real_type& r, const real_type& dz0dr, - const real_type& wj, const real_type& rcut, - const real_type& sinner, const real_type& dinner) -{ - real_type r0inv; - real_type a_r, a_i, b_r, b_i; - real_type da_r[3], da_i[3], db_r[3], db_i[3]; - real_type dz0[3], dr0inv[3], dr0invdr; - real_type rootpq; - - real_type rinv = 1.0 / r; - real_type ux = x * rinv; - real_type uy = y * rinv; - real_type uz = z * rinv; - - r0inv = 1.0 / sqrt(r * r + z0 * z0); - a_r = z0 * r0inv; - a_i = -z * r0inv; - b_r = y * r0inv; - b_i = -x * r0inv; - - dr0invdr = -r0inv * r0inv * r0inv * (r + z0 * dz0dr); - - dr0inv[0] = dr0invdr * ux; - dr0inv[1] = dr0invdr * uy; - dr0inv[2] = dr0invdr * uz; - - dz0[0] = dz0dr * ux; - dz0[1] = dz0dr * uy; - dz0[2] = dz0dr * uz; - - for (int k = 0; k < 3; k++) { - da_r[k] = dz0[k] * r0inv + z0 * dr0inv[k]; - da_i[k] = -z * dr0inv[k]; } - da_i[2] += -r0inv; - - for (int k = 0; k < 3; k++) { - db_r[k] = y * dr0inv[k]; - db_i[k] = -x * dr0inv[k]; - } - - db_i[0] += -r0inv; - db_r[1] += r0inv; - - dulist(0,iatom,jnbor,0).re = 0.0; - dulist(0,iatom,jnbor,1).re = 0.0; - dulist(0,iatom,jnbor,2).re = 0.0; - dulist(0,iatom,jnbor,0).im = 0.0; - dulist(0,iatom,jnbor,1).im = 0.0; - dulist(0,iatom,jnbor,2).im = 0.0; - - for (int j = 1; j <= twojmax; j++) { - int jju = idxu_cache_block[j]; - int jjup = idxu_cache_block[j-1]; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,(j+2)/2), - [&] (const int& mb) { - //for (int mb = 0; 2*mb <= j; mb++) { - const int jju_index = jju+mb+mb*j; - dulist(jju_index,iatom,jnbor,0).re = 0.0; - dulist(jju_index,iatom,jnbor,1).re = 0.0; - dulist(jju_index,iatom,jnbor,2).re = 0.0; - dulist(jju_index,iatom,jnbor,0).im = 0.0; - dulist(jju_index,iatom,jnbor,1).im = 0.0; - dulist(jju_index,iatom,jnbor,2).im = 0.0; - - for (int ma = 0; ma < j; ma++) { - const int jju_index = jju+mb+mb*j+ma; - const int jjup_index = jjup+mb*j+ma; - rootpq = rootpqarray(j - ma,j - mb); - for (int k = 0; k < 3; k++) { - dulist(jju_index,iatom,jnbor,k).re += - rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).re + - da_i[k] * ulist(jjup_index,iatom,jnbor).im + - a_r * dulist(jjup_index,iatom,jnbor,k).re + - a_i * dulist(jjup_index,iatom,jnbor,k).im); - dulist(jju_index,iatom,jnbor,k).im += - rootpq * (da_r[k] * ulist(jjup_index,iatom,jnbor).im - - da_i[k] * ulist(jjup_index,iatom,jnbor).re + - a_r * dulist(jjup_index,iatom,jnbor,k).im - - a_i * dulist(jjup_index,iatom,jnbor,k).re); - } - - rootpq = rootpqarray(ma + 1,j - mb); - for (int k = 0; k < 3; k++) { - dulist(jju_index+1,iatom,jnbor,k).re = - -rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).re + - db_i[k] * ulist(jjup_index,iatom,jnbor).im + - b_r * dulist(jjup_index,iatom,jnbor,k).re + - b_i * dulist(jjup_index,iatom,jnbor,k).im); - dulist(jju_index+1,iatom,jnbor,k).im = - -rootpq * (db_r[k] * ulist(jjup_index,iatom,jnbor).im - - db_i[k] * ulist(jjup_index,iatom,jnbor).re + - b_r * dulist(jjup_index,iatom,jnbor,k).im - - b_i * dulist(jjup_index,iatom,jnbor,k).re); - } - } - - // Only need to add one symmetrized row for convenience - // Symmetry gets "unfolded" during the dedr accumulation - - // copy left side to right side with inversion symmetry VMK 4.4(2) - // u[ma-j][mb-j] = (-1)^(ma-mb)*Conj([u[ma][mb]) - - if (j%2==1 && mb==(j/2)) { - const int mbpar = (mb)%2==0?1:-1; - int mapar = mbpar; - for (int ma = 0; ma <= j; ma++) { - const int jju_index = jju+mb*(j+1)+ma; - const int jjup_index = jju+(mb+2)*(j+1)-(ma+1); - if (mapar == 1) { - for (int k = 0; k < 3; k++) { - dulist(jjup_index,iatom,jnbor,k).re = dulist(jju_index,iatom,jnbor,k).re; - dulist(jjup_index,iatom,jnbor,k).im = -dulist(jju_index,iatom,jnbor,k).im; - } - } else { - for (int k = 0; k < 3; k++) { - dulist(jjup_index,iatom,jnbor,k).re = -dulist(jju_index,iatom,jnbor,k).re; - dulist(jjup_index,iatom,jnbor,k).im = dulist(jju_index,iatom,jnbor,k).im; - } - } - mapar = -mapar; - } - } - }); - } - - real_type sfac = compute_sfac(r, rcut, sinner, dinner); - real_type dsfac = compute_dsfac(r, rcut, sinner, dinner); - - sfac *= wj; - dsfac *= wj; - - // Even though we fill out a full "cached" data layout above, - // we only need the "half" data for the accumulation into dedr. - // Thus we skip updating any unnecessary data. - for (int j = 0; j <= twojmax; j++) { - int jju = idxu_cache_block[j]; - for (int mb = 0; 2*mb <= j; mb++) - for (int ma = 0; ma <= j; ma++) { - dulist(jju,iatom,jnbor,0).re = dsfac * ulist(jju,iatom,jnbor).re * ux + - sfac * dulist(jju,iatom,jnbor,0).re; - dulist(jju,iatom,jnbor,0).im = dsfac * ulist(jju,iatom,jnbor).im * ux + - sfac * dulist(jju,iatom,jnbor,0).im; - dulist(jju,iatom,jnbor,1).re = dsfac * ulist(jju,iatom,jnbor).re * uy + - sfac * dulist(jju,iatom,jnbor,1).re; - dulist(jju,iatom,jnbor,1).im = dsfac * ulist(jju,iatom,jnbor).im * uy + - sfac * dulist(jju,iatom,jnbor,1).im; - dulist(jju,iatom,jnbor,2).re = dsfac * ulist(jju,iatom,jnbor).re * uz + - sfac * dulist(jju,iatom,jnbor,2).re; - dulist(jju,iatom,jnbor,2).im = dsfac * ulist(jju,iatom,jnbor).im * uz + - sfac * dulist(jju,iatom,jnbor,2).im; - - jju++; - } - } + for (int k = 0; k < 3; k++) + dedr(iatom, jnbor, k) = 2 * force_sum[k]; } /* ---------------------------------------------------------------------- @@ -2210,7 +2002,7 @@ int SNAKokkos::compute_ncoeff() template KOKKOS_INLINE_FUNCTION -real_type SNAKokkos::compute_sfac(real_type r, real_type rcut, real_type sinner, real_type dinner) +real_type SNAKokkos::compute_sfac(real_type r, real_type rcut, real_type sinner, real_type dinner) const { real_type sfac_outer; constexpr real_type one = static_cast(1.0); @@ -2243,7 +2035,7 @@ real_type SNAKokkos::compute_sfac(real_typ template KOKKOS_INLINE_FUNCTION -real_type SNAKokkos::compute_dsfac(real_type r, real_type rcut, real_type sinner, real_type dinner) +real_type SNAKokkos::compute_dsfac(real_type r, real_type rcut, real_type sinner, real_type dinner) const { real_type sfac_outer, dsfac_outer, sfac_inner, dsfac_inner; constexpr real_type one = static_cast(1.0); @@ -2291,7 +2083,7 @@ real_type SNAKokkos::compute_dsfac(real_ty template KOKKOS_INLINE_FUNCTION -void SNAKokkos::compute_s_dsfac(const real_type r, const real_type rcut, const real_type sinner, const real_type dinner, real_type& sfac, real_type& dsfac) { +void SNAKokkos::compute_s_dsfac(const real_type r, const real_type rcut, const real_type sinner, const real_type dinner, real_type& sfac, real_type& dsfac) const { real_type sfac_outer, dsfac_outer, sfac_inner, dsfac_inner; constexpr real_type one = static_cast(1.0); @@ -2339,41 +2131,26 @@ double SNAKokkos::memory_usage() bytes += MemKK::memory_usage(rootpqarray); bytes += MemKK::memory_usage(cglist); -#ifdef LMP_KOKKOS_GPU - if (!host_flag) { + bytes += MemKK::memory_usage(ulisttot_re); + bytes += MemKK::memory_usage(ulisttot_im); + bytes += MemKK::memory_usage(ulisttot); - bytes += MemKK::memory_usage(a_pack); - bytes += MemKK::memory_usage(b_pack); - bytes += MemKK::memory_usage(da_pack); - bytes += MemKK::memory_usage(db_pack); - bytes += MemKK::memory_usage(sfac_pack); + bytes += MemKK::memory_usage(zlist); + bytes += MemKK::memory_usage(blist); + bytes += MemKK::memory_usage(ylist_re); + bytes += MemKK::memory_usage(ylist_im); - bytes += MemKK::memory_usage(ulisttot_re_pack); - bytes += MemKK::memory_usage(ulisttot_im_pack); - bytes += MemKK::memory_usage(ulisttot_pack); - - bytes += MemKK::memory_usage(zlist_pack); - bytes += MemKK::memory_usage(blist_pack); - - bytes += MemKK::memory_usage(ylist_pack_re); - bytes += MemKK::memory_usage(ylist_pack_im); + if constexpr (!host_flag) { + bytes += MemKK::memory_usage(a_gpu); + bytes += MemKK::memory_usage(b_gpu); + bytes += MemKK::memory_usage(da_gpu); + bytes += MemKK::memory_usage(db_gpu); + bytes += MemKK::memory_usage(sfac_gpu); } else { -#endif - - bytes += MemKK::memory_usage(ulist); - bytes += MemKK::memory_usage(ulisttot); - bytes += MemKK::memory_usage(ulisttot_full); - - bytes += MemKK::memory_usage(zlist); - bytes += MemKK::memory_usage(blist); - - bytes += MemKK::memory_usage(ylist); - - bytes += MemKK::memory_usage(dulist); -#ifdef LMP_KOKKOS_GPU + bytes += MemKK::memory_usage(ulist_cpu); + bytes += MemKK::memory_usage(dulist_cpu); } -#endif bytes += MemKK::memory_usage(dedr);