diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index 9d44ee5826..c5782f5e04 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -231,7 +231,7 @@ OPT. * :doc:`oxrna2/stk ` * :doc:`oxrna2/xstk ` * :doc:`oxrna2/coaxstk ` - * :doc:`pace ` + * :doc:`pace (k) ` * :doc:`peri/eps ` * :doc:`peri/lps (o) ` * :doc:`peri/pmb (o) ` diff --git a/doc/src/pair_pace.rst b/doc/src/pair_pace.rst index ad66febdf5..3d5649d912 100644 --- a/doc/src/pair_pace.rst +++ b/doc/src/pair_pace.rst @@ -1,7 +1,10 @@ .. index:: pair_style pace +.. index:: pair_style pace/kk pair_style pace command -======================== +======================= + +Accelerator Variants: *snap/kk* Syntax """""" @@ -10,13 +13,14 @@ Syntax pair_style pace ... keyword values ... -* an optional keyword may be appended -* keyword = *product* or *recursive* +* one or more keyword/value pairs may be appended .. parsed-literal:: + keyword = *product* or *recursive* or *chunksize* *product* = use product algorithm for basis functions *recursive* = use recursive algorithm for basis functions + *chunksize* value = number of atoms in each pass Examples """""""" @@ -24,7 +28,7 @@ Examples .. code-block:: LAMMPS pair_style pace - pair_style pace product + pair_style pace product chunksize 2048 pair_coeff * * Cu-PBE-core-rep.ace Cu Description @@ -59,11 +63,19 @@ Note that unlike for other potentials, cutoffs are not set in the pair_style or pair_coeff command; they are specified in the ACE file. -The pair_style *pace* command may be followed by an optional keyword +The pair_style *pace* command may be followed by the optional keyword *product* or *recursive*, which determines which of two algorithms is used for the calculation of basis functions and derivatives. The default is *recursive*. +The keyword *chunksize* is only applicable when +using the pair style *pace* with the KOKKOS package on GPUs and is +ignored otherwise. This keyword controls the number of atoms +in each pass used to compute the atomic cluster expansion and is used to +avoid running out of memory. For example if there are 8192 atoms in the +simulation and the *chunksize* is set to 4096, the ACE +calculation will be broken up into two passes (running on a single GPU). + See the :doc:`pair_coeff ` page for alternate ways to specify the path for the ACE coefficient file. @@ -88,6 +100,10 @@ This pair style can only be used via the *pair* keyword of the ---------- +.. include:: accel_styles.rst + +---------- + Restrictions """""""""""" @@ -103,7 +119,7 @@ Related commands Default """"""" -recursive +recursive, chunksize = 4096 .. _Drautz20191: diff --git a/examples/PACKAGES/pace/in.pace.product b/examples/PACKAGES/pace/in.pace.product index d70bb0f67c..11049ee922 100644 --- a/examples/PACKAGES/pace/in.pace.product +++ b/examples/PACKAGES/pace/in.pace.product @@ -14,8 +14,6 @@ create_atoms 1 box mass 1 26.98 -group Al type 1 - pair_style pace product pair_coeff * * Cu-PBE-core-rep.ace Cu diff --git a/examples/PACKAGES/pace/in.pace.recursive b/examples/PACKAGES/pace/in.pace.recursive index dd655eb18d..1d6d9116f0 100644 --- a/examples/PACKAGES/pace/in.pace.recursive +++ b/examples/PACKAGES/pace/in.pace.recursive @@ -14,8 +14,6 @@ create_atoms 1 box mass 1 26.98 -group Al type 1 - pair_style pace recursive pair_coeff * * Cu-PBE-core-rep.ace Cu diff --git a/src/Depend.sh b/src/Depend.sh index 65744494f5..8046fd2636 100755 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -154,6 +154,10 @@ if (test $1 = "RIGID") then depend DPD-SMOOTH fi +if (test $1 = "ML-PACE") then + depend KOKKOS +fi + if (test $1 = "ML-SNAP") then depend KOKKOS depend ML-IAP diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index da397cd677..022da7855a 100755 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -291,6 +291,8 @@ action pair_morse_kokkos.cpp action pair_morse_kokkos.h action pair_multi_lucy_rx_kokkos.cpp pair_multi_lucy_rx.cpp action pair_multi_lucy_rx_kokkos.h pair_multi_lucy_rx.h +action pair_pace_kokkos.cpp pair_pace.cpp +action pair_pace_kokkos.h pair_pace.h action pair_reaxff_kokkos.cpp pair_reaxff.cpp action pair_reaxff_kokkos.h pair_reaxff.h action pair_snap_kokkos.cpp pair_snap.cpp diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index 2926ba1d36..a537851829 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -1284,8 +1284,13 @@ struct alignas(2*sizeof(real_type_)) SNAComplex static constexpr complex one() { return complex(static_cast(1.), static_cast(0.)); } KOKKOS_INLINE_FUNCTION - const complex conj() { return complex(re, -im); } + const complex conj() const { return complex(re, -im); } + KOKKOS_INLINE_FUNCTION + const real_type real_part_product(const complex &cm2) { return re * cm2.re - im * cm2.im; } + + KOKKOS_INLINE_FUNCTION + const real_type real_part_product(const real_type &r) const { return re * r; } }; template @@ -1293,6 +1298,16 @@ KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator*(const real_type& r, return SNAComplex(r*self.re, r*self.im); } +template +KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator*(const SNAComplex& self, const real_type& r) { + return SNAComplex(r*self.re, r*self.im); +} + +template +KOKKOS_FORCEINLINE_FUNCTION SNAComplex operator*(const SNAComplex& self, const SNAComplex& cm2) { + return SNAComplex(self.re*cm2.re - self.im*cm2.im, self.re*cm2.im + self.im*cm2.re); +} + typedef SNAComplex SNAcomplex; #if defined(KOKKOS_ENABLE_CXX11) diff --git a/src/KOKKOS/memory_kokkos.h b/src/KOKKOS/memory_kokkos.h index 1134e627ca..e82746f34d 100644 --- a/src/KOKKOS/memory_kokkos.h +++ b/src/KOKKOS/memory_kokkos.h @@ -273,6 +273,63 @@ void destroy_kokkos(TYPE data, typename TYPE::value_type** &array) array = nullptr; } +/* ---------------------------------------------------------------------- + reallocate Kokkos views without initialization + deallocate first to reduce memory use +------------------------------------------------------------------------- */ + +template +void realloc_kokkos(TYPE &data, const char *name, int n1) +{ + data = TYPE(); + data = TYPE(Kokkos::NoInit(std::string(name)),n1); +} + +template +void realloc_kokkos(TYPE &data, const char *name, int n1, int n2) +{ + data = TYPE(); + data = TYPE(Kokkos::NoInit(std::string(name)),n1,n2); +} + +template +void realloc_kokkos(TYPE &data, const char *name, int n1, int n2, int n3) +{ + data = TYPE(); + data = TYPE(Kokkos::NoInit(std::string(name)),n1,n2,n3); +} + +template +void realloc_kokkos(TYPE &data, const char *name, int n1, int n2, int n3, int n4) +{ + data = TYPE(); + data = TYPE(Kokkos::NoInit(std::string(name)),n1,n2,n3,n4); +} + +template +void realloc_kokkos(TYPE &data, const char *name, int n1, int n2, int n3, int n4, int n5) +{ + data = TYPE(); + data = TYPE(Kokkos::NoInit(std::string(name)),n1,n2,n3,n4,n5); +} + +template +void realloc_kokkos(TYPE &data, const char *name, int n1, int n2, int n3, int n4, int n5, int n6) +{ + data = TYPE(); + data = TYPE(Kokkos::NoInit(std::string(name)),n1,n2,n3,n4,n5,n6); +} + +/* ---------------------------------------------------------------------- + get memory usage of Kokkos view in bytes +------------------------------------------------------------------------- */ + +template +double memory_usage(TYPE &data) +{ + return data.span() * sizeof(typename TYPE::value_type); +} + }; } diff --git a/src/KOKKOS/pair_pace_kokkos.cpp b/src/KOKKOS/pair_pace_kokkos.cpp new file mode 100644 index 0000000000..bb4986344e --- /dev/null +++ b/src/KOKKOS/pair_pace_kokkos.cpp @@ -0,0 +1,1721 @@ +// clang-format off +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + aE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Stan Moore (SNL) +------------------------------------------------------------------------- */ + +#include "pair_pace_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "error.h" +#include "force.h" +#include "kokkos.h" +#include "math_const.h" +#include "memory_kokkos.h" +#include "neighbor_kokkos.h" +#include "neigh_request.h" + +#include "ace_c_basis.h" +#include "ace_evaluator.h" +#include "ace_recursive.h" +#include "ace_version.h" +#include "ace_radial.h" +#include + +namespace LAMMPS_NS { +struct ACEImpl { + ACEImpl() : basis_set(nullptr), ace(nullptr) {} + ~ACEImpl() + { + delete basis_set; + delete ace; + } + ACECTildeBasisSet *basis_set; + ACERecursiveEvaluator *ace; +}; +} // namespace LAMMPS_NS + +using namespace LAMMPS_NS; +using namespace MathConst; + +enum{FS,FS_SHIFTEDSCALED}; + +/* ---------------------------------------------------------------------- */ + +template +PairPACEKokkos::PairPACEKokkos(LAMMPS *lmp) : PairPACE(lmp) +{ + respa_enable = 0; + + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice::space; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; + + host_flag = (execution_space == Host); +} + +/* ---------------------------------------------------------------------- + check if allocated, since class can be destructed when incomplete +------------------------------------------------------------------------- */ + +template +PairPACEKokkos::~PairPACEKokkos() +{ + if (copymode) return; + + memoryKK->destroy_kokkos(k_eatom,eatom); + memoryKK->destroy_kokkos(k_vatom,vatom); + + // deallocate views of views in serial to prevent issues in Kokkos tools + + for (int i = 0; i < nelements; i++) { + for (int j = 0; j < nelements; j++) { + k_splines_gk.h_view(i, j).deallocate(); + k_splines_rnl.h_view(i, j).deallocate(); + k_splines_hc.h_view(i, j).deallocate(); + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEKokkos::grow(int natom, int maxneigh) +{ + auto basis_set = aceimpl->basis_set; + + if (A.extent(0) < natom) { + + memoryKK->realloc_kokkos(A, "pace:A", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1)); + memoryKK->realloc_kokkos(A_rank1, "pace:A_rank1", natom, nelements, nradbase); + + memoryKK->realloc_kokkos(A_list, "pace:A_list", natom, idx_rho_max, basis_set->rankmax); + //size is +1 of max to avoid out-of-boundary array access in double-triangular scheme + memoryKK->realloc_kokkos(A_forward_prod, "pace:A_forward_prod", natom, idx_rho_max, basis_set->rankmax + 1); + + memoryKK->realloc_kokkos(e_atom, "pace:e_atom", natom); + memoryKK->realloc_kokkos(rhos, "pace:rhos", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion + memoryKK->realloc_kokkos(dF_drho, "pace:dF_drho", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion + + memoryKK->realloc_kokkos(weights, "pace:weights", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1)); + memoryKK->realloc_kokkos(weights_rank1, "pace:weights_rank1", natom, nelements, nradbase); + + // hard-core repulsion + memoryKK->realloc_kokkos(rho_core, "pace:rho_core", natom); + memoryKK->realloc_kokkos(dF_drho_core, "pace:dF_drho_core", natom); + memoryKK->realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_rho_max, basis_set->rankmax); + } + + if (ylm.extent(0) < natom || ylm.extent(1) < maxneigh) { + + // radial functions + memoryKK->realloc_kokkos(fr, "pace:fr", natom, maxneigh, nradmax, lmax + 1); + memoryKK->realloc_kokkos(dfr, "pace:dfr", natom, maxneigh, nradmax, lmax + 1); + memoryKK->realloc_kokkos(gr, "pace:gr", natom, maxneigh, nradbase); + memoryKK->realloc_kokkos(dgr, "pace:dgr", natom, maxneigh, nradbase); + const int max_num_functions = MAX(nradbase, nradmax*(lmax + 1)); + memoryKK->realloc_kokkos(d_values, "pace:d_values", natom, maxneigh, max_num_functions); + memoryKK->realloc_kokkos(d_derivatives, "pace:d_derivatives", natom, maxneigh, max_num_functions); + + // hard-core repulsion + memoryKK->realloc_kokkos(cr, "pace:cr", natom, maxneigh); + memoryKK->realloc_kokkos(dcr, "pace:dcr", natom, maxneigh); + + // spherical harmonics + memoryKK->realloc_kokkos(plm, "pace:plm", natom, maxneigh, (lmax + 1) * (lmax + 1)); + memoryKK->realloc_kokkos(dplm, "pace:dplm", natom, maxneigh, (lmax + 1) * (lmax + 1)); + memoryKK->realloc_kokkos(ylm, "pace:ylm", natom, maxneigh, (lmax + 1) * (lmax + 1)); + memoryKK->realloc_kokkos(dylm, "pace:dylm", natom, maxneigh, (lmax + 1) * (lmax + 1)); + + // short neigh list + memoryKK->realloc_kokkos(d_ncount, "pace:ncount", natom); + memoryKK->realloc_kokkos(d_mu, "pace:mu", natom, maxneigh); + memoryKK->realloc_kokkos(d_rhats, "pace:rhats", natom, maxneigh); + memoryKK->realloc_kokkos(d_rnorms, "pace:rnorms", natom, maxneigh); + memoryKK->realloc_kokkos(d_nearest, "pace:nearest", natom, maxneigh); + + memoryKK->realloc_kokkos(f_ij, "pace:f_ij", natom, maxneigh); + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEKokkos::copy_pertype() +{ + auto basis_set = aceimpl->basis_set; + + memoryKK->realloc_kokkos(d_rho_core_cutoff, "pace:rho_core_cutoff", nelements); + memoryKK->realloc_kokkos(d_drho_core_cutoff, "pace:drho_core_cutoff", nelements); + memoryKK->realloc_kokkos(d_E0vals, "pace:E0vals", nelements); + memoryKK->realloc_kokkos(d_ndensity, "pace:ndensity", nelements); + memoryKK->realloc_kokkos(d_npoti, "pace:npoti", nelements); + + auto h_rho_core_cutoff = Kokkos::create_mirror_view(d_rho_core_cutoff); + auto h_drho_core_cutoff = Kokkos::create_mirror_view(d_drho_core_cutoff); + auto h_E0vals = Kokkos::create_mirror_view(d_E0vals); + auto h_ndensity = Kokkos::create_mirror_view(d_ndensity); + auto h_npoti = Kokkos::create_mirror_view(d_npoti); + + for (int n = 0; n < nelements; n++) { + h_rho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).rho_core_cutoff; + h_drho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).drho_core_cutoff; + + h_E0vals(n)= basis_set->E0vals(n); + + const int ndensity = h_ndensity(n) = basis_set->map_embedding_specifications.at(n).ndensity; + + string npoti = basis_set->map_embedding_specifications.at(n).npoti; + if (npoti == "FinnisSinclair") + h_npoti(n) = FS; + else if (npoti == "FinnisSinclairShiftedScaled") + h_npoti(n) = FS_SHIFTEDSCALED; + } + + Kokkos::deep_copy(d_rho_core_cutoff, h_rho_core_cutoff); + Kokkos::deep_copy(d_drho_core_cutoff, h_drho_core_cutoff); + Kokkos::deep_copy(d_E0vals, h_E0vals); + Kokkos::deep_copy(d_ndensity, h_ndensity); + Kokkos::deep_copy(d_npoti, h_npoti); + + memoryKK->realloc_kokkos(d_wpre, "pace:wpre", nelements, basis_set->ndensitymax); + memoryKK->realloc_kokkos(d_mexp, "pace:mexp", nelements, basis_set->ndensitymax); + + auto h_wpre = Kokkos::create_mirror_view(d_wpre); + auto h_mexp = Kokkos::create_mirror_view(d_mexp); + + for (int n = 0; n < nelements; n++) { + const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity; + for (int p = 0; p < ndensity; p++) { + h_wpre(n, p) = basis_set->map_embedding_specifications.at(n).FS_parameters.at(p * 2 + 0); + h_mexp(n, p) = basis_set->map_embedding_specifications.at(n).FS_parameters.at(p * 2 + 1); + } + } + + Kokkos::deep_copy(d_wpre, h_wpre); + Kokkos::deep_copy(d_mexp, h_mexp); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEKokkos::copy_splines() +{ + auto basis_set = aceimpl->basis_set; + + if (k_splines_gk.d_view.data()) { + for (int i = 0; i < nelements; i++) { + for (int j = 0; j < nelements; j++) { + k_splines_gk.h_view(i, j).deallocate(); + k_splines_rnl.h_view(i, j).deallocate(); + k_splines_hc.h_view(i, j).deallocate(); + } + } + } + + k_splines_gk = Kokkos::DualView("pace:splines_gk", nelements, nelements); + k_splines_rnl = Kokkos::DualView("pace:splines_rnl", nelements, nelements); + k_splines_hc = Kokkos::DualView("pace:splines_hc", nelements, nelements); + + ACERadialFunctions* radial_functions = dynamic_cast(basis_set->radial_functions); + + for (int i = 0; i < nelements; i++) { + for (int j = 0; j < nelements; j++) { + k_splines_gk.h_view(i, j) = radial_functions->splines_gk(i, j); + k_splines_rnl.h_view(i, j) = radial_functions->splines_rnl(i, j); + k_splines_hc.h_view(i, j) = radial_functions->splines_hc(i, j); + } + } + + k_splines_gk.modify_host(); + k_splines_rnl.modify_host(); + k_splines_hc.modify_host(); + + k_splines_gk.sync_device(); + k_splines_rnl.sync_device(); + k_splines_hc.sync_device(); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEKokkos::copy_tilde() +{ + auto basis_set = aceimpl->basis_set; + + // flatten loops, get per-element count and max + + idx_rho_max = 0; + int total_basis_size_max = 0; + + memoryKK->realloc_kokkos(d_idx_rho_count, "pace:idx_rho_count", nelements); + auto h_idx_rho_count = Kokkos::create_mirror_view(d_idx_rho_count); + + for (int n = 0; n < nelements; n++) { + int idx_rho = 0; + const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n]; + const int total_basis_size = basis_set->total_basis_size[n]; + + ACECTildeBasisFunction *basis = basis_set->basis[n]; + + // rank=1 + for (int func_rank1_ind = 0; func_rank1_ind < total_basis_size_rank1; ++func_rank1_ind) + idx_rho++; + + // rank > 1 + for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) { + ACECTildeBasisFunction *func = &basis[func_ind]; + + // loop over {ms} combinations in sum + for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) + idx_rho++; + } + h_idx_rho_count(n) = idx_rho; + idx_rho_max = MAX(idx_rho_max, idx_rho); + total_basis_size_max = MAX(total_basis_size_max, total_basis_size_rank1 + total_basis_size); + } + + Kokkos::deep_copy(d_idx_rho_count, h_idx_rho_count); + + memoryKK->realloc_kokkos(d_rank, "pace:rank", nelements, total_basis_size_max); + memoryKK->realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_basis_size_max); + memoryKK->realloc_kokkos(d_offsets, "pace:offsets", nelements, idx_rho_max); + memoryKK->realloc_kokkos(d_mus, "pace:mus", nelements, total_basis_size_max, basis_set->rankmax); + memoryKK->realloc_kokkos(d_ns, "pace:ns", nelements, total_basis_size_max, basis_set->rankmax); + memoryKK->realloc_kokkos(d_ls, "pace:ls", nelements, total_basis_size_max, basis_set->rankmax); + memoryKK->realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_rho_max, basis_set->rankmax); + memoryKK->realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_rho_max, basis_set->ndensitymax); + + auto h_rank = Kokkos::create_mirror_view(d_rank); + auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs); + auto h_offsets = Kokkos::create_mirror_view(d_offsets); + auto h_mus = Kokkos::create_mirror_view(d_mus); + auto h_ns = Kokkos::create_mirror_view(d_ns); + auto h_ls = Kokkos::create_mirror_view(d_ls); + auto h_ms_combs = Kokkos::create_mirror_view(d_ms_combs); + auto h_ctildes = Kokkos::create_mirror_view(d_ctildes); + + // copy values on host + + for (int n = 0; n < nelements; n++) { + const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n]; + const int total_basis_size = basis_set->total_basis_size[n]; + + ACECTildeBasisFunction *basis_rank1 = basis_set->basis_rank1[n]; + ACECTildeBasisFunction *basis = basis_set->basis[n]; + + const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity; + + int idx_rho = 0; + + // rank=1 + for (int offset = 0; offset < total_basis_size_rank1; ++offset) { + ACECTildeBasisFunction *func = &basis_rank1[offset]; + h_rank(n, offset) = 1; + h_mus(n, offset, 0) = func->mus[0]; + h_ns(n, offset, 0) = func->ns[0]; + for (int p = 0; p < ndensity; p++) + h_ctildes(n, idx_rho, p) = func->ctildes[p]; + h_offsets(n, idx_rho) = offset; + idx_rho++; + } + + // rank > 1 + for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) { + ACECTildeBasisFunction *func = &basis[func_ind]; + // TODO: check if func->ctildes are zero, then skip + + const int offset = total_basis_size_rank1 + func_ind; + + const int rank = h_rank(n, offset) = func->rank; + h_num_ms_combs(n, offset) = func->num_ms_combs; + for (int t = 0; t < rank; t++) { + h_mus(n, offset, t) = func->mus[t]; + h_ns(n, offset, t) = func->ns[t]; + h_ls(n, offset, t) = func->ls[t]; + } + + // loop over {ms} combinations in sum + for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) { + auto ms = &func->ms_combs[ms_ind * rank]; // current ms-combination (of length = rank) + for (int t = 0; t < rank; t++) + h_ms_combs(n, idx_rho, t) = ms[t]; + + for (int p = 0; p < ndensity; ++p) { + // real-part only multiplication + h_ctildes(n, idx_rho, p) = func->ctildes[ms_ind * ndensity + p]; + } + h_offsets(n, idx_rho) = offset; + idx_rho++; + } + } + } + + Kokkos::deep_copy(d_rank, h_rank); + Kokkos::deep_copy(d_num_ms_combs, h_num_ms_combs); + Kokkos::deep_copy(d_offsets, h_offsets); + Kokkos::deep_copy(d_mus, h_mus); + Kokkos::deep_copy(d_ns, h_ns); + Kokkos::deep_copy(d_ls, h_ls); + Kokkos::deep_copy(d_ms_combs, h_ms_combs); + Kokkos::deep_copy(d_ctildes, h_ctildes); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +template +void PairPACEKokkos::init_style() +{ + if (host_flag) { + if (lmp->kokkos->nthreads > 1) + error->all(FLERR,"Pair style pace/kk can currently only run on a single " + "CPU thread"); + + PairPACE::init_style(); + return; + } + + if (atom->tag_enable == 0) error->all(FLERR, "Pair style PACE requires atom IDs"); + if (force->newton_pair == 0) error->all(FLERR, "Pair style PACE requires newton pair on"); + + // neighbor list request for KOKKOS + + neighflag = lmp->kokkos->neighflag; + + auto request = neighbor->add_request(this, NeighConst::REQ_FULL); + request->set_kokkos_host(std::is_same::value && + !std::is_same::value); + request->set_kokkos_device(std::is_same::value); + if (neighflag == FULL) + error->all(FLERR,"Must use half neighbor list style with pair pace/kk"); + + auto basis_set = aceimpl->basis_set; + + nelements = basis_set->nelements; + lmax = basis_set->lmax; + nradmax = basis_set->nradmax; + nradbase = basis_set->nradbase; + + // spherical harmonics + + memoryKK->realloc_kokkos(alm, "pace:alm", (lmax + 1) * (lmax + 1)); + memoryKK->realloc_kokkos(blm, "pace:blm", (lmax + 1) * (lmax + 1)); + memoryKK->realloc_kokkos(cl, "pace:cl", lmax + 1); + memoryKK->realloc_kokkos(dl, "pace:dl", lmax + 1); + + pre_compute_harmonics(lmax); + copy_pertype(); + copy_splines(); + copy_tilde(); +} + +/* ---------------------------------------------------------------------- + init for one type pair i,j and corresponding j,i +------------------------------------------------------------------------- */ + +template +double PairPACEKokkos::init_one(int i, int j) +{ + double cutone = PairPACE::init_one(i,j); + + k_scale.h_view(i,j) = k_scale.h_view(j,i) = scale[i][j]; + k_scale.template modify(); + + k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone; + k_cutsq.template modify(); + + return cutone; +} + +/* ---------------------------------------------------------------------- + set coeffs for one or more type pairs +------------------------------------------------------------------------- */ + +template +void PairPACEKokkos::coeff(int narg, char **arg) +{ + PairPACE::coeff(narg,arg); + + // Set up element lists + + auto h_map = Kokkos::create_mirror_view(d_map); + + for (int i = 1; i <= atom->ntypes; i++) + h_map(i) = map[i]; + + Kokkos::deep_copy(d_map,h_map); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEKokkos::allocate() +{ + PairPACE::allocate(); + + int n = atom->ntypes + 1; + memoryKK->realloc_kokkos(d_map, "pace:map", n); + + memoryKK->realloc_kokkos(k_cutsq, "pace:cutsq", n, n); + d_cutsq = k_cutsq.template view(); + + memoryKK->realloc_kokkos(k_scale, "pace:scale", n, n); + d_scale = k_scale.template view(); +} + +/* ---------------------------------------------------------------------- */ + +template +struct FindMaxNumNeighs { + typedef DeviceType device_type; + NeighListKokkos k_list; + + FindMaxNumNeighs(NeighListKokkos* nl): k_list(*nl) {} + ~FindMaxNumNeighs() {k_list.copymode = 1;} + + KOKKOS_INLINE_FUNCTION + void operator() (const int& ii, int& maxneigh) const { + const int i = k_list.d_ilist[ii]; + const int num_neighs = k_list.d_numneigh[i]; + if (maxneigh < num_neighs) maxneigh = num_neighs; + } +}; + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEKokkos::compute(int eflag_in, int vflag_in) +{ + if (host_flag) { + atomKK->sync(Host,X_MASK|TYPE_MASK); + PairPACE::compute(eflag_in,vflag_in); + atomKK->modified(Host,F_MASK); + return; + } + + eflag = eflag_in; + vflag = vflag_in; + + if (neighflag == FULL) no_virial_fdotr_compute = 1; + + ev_init(eflag,vflag,0); + + // reallocate per-atom arrays if necessary + + if (eflag_atom) { + memoryKK->destroy_kokkos(k_eatom,eatom); + memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom"); + d_eatom = k_eatom.view(); + } + if (vflag_atom) { + memoryKK->destroy_kokkos(k_vatom,vatom); + memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom"); + d_vatom = k_vatom.view(); + } + + copymode = 1; + int newton_pair = force->newton_pair; + if (newton_pair == false) + error->all(FLERR,"PairPACEKokkos requires 'newton on'"); + + if (recursive) + error->all(FLERR,"Must use 'product' algorithm with pair pace/kk"); + + atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK); + x = atomKK->k_x.view(); + f = atomKK->k_f.view(); + type = atomKK->k_type.view(); + k_scale.template sync(); + k_cutsq.template sync(); + + NeighListKokkos* k_list = static_cast*>(list); + d_numneigh = k_list->d_numneigh; + d_neighbors = k_list->d_neighbors; + d_ilist = k_list->d_ilist; + inum = list->inum; + + need_dup = lmp->kokkos->need_dup(); + if (need_dup) { + dup_f = Kokkos::Experimental::create_scatter_view(f); + dup_vatom = Kokkos::Experimental::create_scatter_view(d_vatom); + } else { + ndup_f = Kokkos::Experimental::create_scatter_view(f); + ndup_vatom = Kokkos::Experimental::create_scatter_view(d_vatom); + } + + maxneigh = 0; + Kokkos::parallel_reduce("pace::find_maxneigh", inum, FindMaxNumNeighs(k_list), Kokkos::Max(maxneigh)); + + int vector_length_default = 1; + int team_size_default = 1; + if (!host_flag) + team_size_default = 32; + + chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user + chunk_offset = 0; + + grow(chunk_size, maxneigh); + + EV_FLOAT ev; + + while (chunk_offset < inum) { // chunk up loop to prevent running out of memory + + Kokkos::deep_copy(weights, 0.0); + Kokkos::deep_copy(weights_rank1, 0.0); + Kokkos::deep_copy(A, 0.0); + Kokkos::deep_copy(A_rank1, 0.0); + Kokkos::deep_copy(rhos, 0.0); + + EV_FLOAT ev_tmp; + + if (chunk_size > inum - chunk_offset) + chunk_size = inum - chunk_offset; + + //Neigh + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + int scratch_size = scratch_size_helper(team_size * maxneigh); + typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + } + + //ComputeRadial + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + typename Kokkos::TeamPolicy policy_radial(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + Kokkos::parallel_for("ComputeRadial",policy_radial,*this); + } + + //ComputeYlm + { + int vector_length = vector_length_default; + int team_size = 16; + check_team_size_for(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + typename Kokkos::TeamPolicy policy_ylm(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + Kokkos::parallel_for("ComputeYlm",policy_ylm,*this); + } + + //ComputeAi + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + typename Kokkos::TeamPolicy policy_ai(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + Kokkos::parallel_for("ComputeAi",policy_ai,*this); + } + + //ConjugateAi + { + typename Kokkos::RangePolicy policy_conj_ai(0,chunk_size); + Kokkos::parallel_for("ConjugateAi",policy_conj_ai,*this); + } + + //ComputeRho + { + typename Kokkos::RangePolicy policy_rho(0,chunk_size*idx_rho_max); + Kokkos::parallel_for("ComputeRho",policy_rho,*this); + } + + //ComputeFS + { + typename Kokkos::RangePolicy policy_fs(0,chunk_size); + Kokkos::parallel_for("ComputeFS",policy_fs,*this); + } + + //ComputeWeights + { + typename Kokkos::RangePolicy policy_weights(0,chunk_size*idx_rho_max); + Kokkos::parallel_for("ComputeWeights",policy_weights,*this); + } + + //ComputeDerivative + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + typename Kokkos::TeamPolicy policy_derivative(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + Kokkos::parallel_for("ComputeDerivative",policy_derivative,*this); + } + + //ComputeForce + { + if (evflag) { + if (neighflag == HALF) { + typename Kokkos::RangePolicy > policy_force(0,chunk_size); + Kokkos::parallel_reduce(policy_force, *this, ev_tmp); + } else if (neighflag == HALFTHREAD) { + typename Kokkos::RangePolicy > policy_force(0,chunk_size); + Kokkos::parallel_reduce("ComputeForce",policy_force, *this, ev_tmp); + } + } else { + if (neighflag == HALF) { + typename Kokkos::RangePolicy > policy_force(0,chunk_size); + Kokkos::parallel_for(policy_force, *this); + } else if (neighflag == HALFTHREAD) { + typename Kokkos::RangePolicy > policy_force(0,chunk_size); + Kokkos::parallel_for("ComputeForce",policy_force, *this); + } + } + } + ev += ev_tmp; + chunk_offset += chunk_size; + + } // end while + + if (need_dup) + Kokkos::Experimental::contribute(f, dup_f); + + if (eflag_global) eng_vdwl += ev.evdwl; + if (vflag_global) { + virial[0] += ev.v[0]; + virial[1] += ev.v[1]; + virial[2] += ev.v[2]; + virial[3] += ev.v[3]; + virial[4] += ev.v[4]; + virial[5] += ev.v[5]; + } + + if (vflag_fdotr) pair_virial_fdotr_compute(this); + + if (eflag_atom) { + k_eatom.template modify(); + k_eatom.template sync(); + } + + if (vflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_vatom, dup_vatom); + k_vatom.template modify(); + k_vatom.template sync(); + } + + atomKK->modified(execution_space,F_MASK); + + copymode = 0; + + // free duplicated memory + if (need_dup) { + dup_f = decltype(dup_f)(); + dup_vatom = decltype(dup_vatom)(); + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::operator() (TagPairPACEComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const +{ + const int ii = team.league_rank(); + const int i = d_ilist[ii + chunk_offset]; + const int itype = type[i]; + const X_FLOAT xtmp = x(i,0); + const X_FLOAT ytmp = x(i,1); + const X_FLOAT ztmp = x(i,2); + const int jnum = d_numneigh[i]; + + // get a pointer to scratch memory + // This is used to cache whether or not an atom is within the cutoff + // If it is, inside is assigned to 1, otherwise -1 + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * maxneigh; // offset into pointer for entire team + int* inside = (int*)team.team_shmem().get_shmem(team.team_size() * maxneigh * sizeof(int), 0) + scratch_shift; + + // loop over list of all neighbors within force cutoff + // distsq[] = distance sq to each + // rlist[] = distance vector to each + // nearest[] = atom indices of neighbors + + int ncount = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,jnum), + [&] (const int jj, int& count) { + int j = d_neighbors(i,jj); + j &= NEIGHMASK; + + const int jtype = type(j); + + const F_FLOAT delx = xtmp - x(j,0); + const F_FLOAT dely = ytmp - x(j,1); + const F_FLOAT delz = ztmp - x(j,2); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + inside[jj] = -1; + if (rsq < d_cutsq(itype,jtype)) { + inside[jj] = 1; + count++; + } + },ncount); + + d_ncount(ii) = ncount; + + Kokkos::parallel_scan(Kokkos::TeamThreadRange(team,jnum), + [&] (const int jj, int& offset, bool final) { + + if (inside[jj] < 0) return; + + if (final) { + int j = d_neighbors(i,jj); + j &= NEIGHMASK; + const F_FLOAT delx = xtmp - x(j,0); + const F_FLOAT dely = ytmp - x(j,1); + const F_FLOAT delz = ztmp - x(j,2); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + const F_FLOAT r = sqrt(rsq); + const F_FLOAT rinv = 1.0/r; + const int mu_j = d_map(type(j)); + d_mu(ii,offset) = mu_j; + d_rnorms(ii,offset) = r; + d_rhats(ii,offset,0) = -delx*rinv; + d_rhats(ii,offset,1) = -dely*rinv; + d_rhats(ii,offset,2) = -delz*rinv; + d_nearest(ii,offset) = j; + } + offset++; + }); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::operator() (TagPairPACEComputeRadial, const typename Kokkos::TeamPolicy::member_type& team) const +{ + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % + ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + const int i = d_ilist[ii + chunk_offset]; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ncount = d_ncount(ii); + if (jj >= ncount) return; + + const double r_norm = d_rnorms(ii, jj); + const int mu_i = d_map(type(i)); + const int mu_j = d_mu(ii, jj); + + evaluate_splines(ii, jj, r_norm, nradbase, nradmax, mu_i, mu_j); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::operator() (TagPairPACEComputeYlm, const typename Kokkos::TeamPolicy::member_type& team) const +{ + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % + ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ncount = d_ncount(ii); + if (jj >= ncount) return; + + const double xn = d_rhats(ii, jj, 0); + const double yn = d_rhats(ii, jj, 1); + const double zn = d_rhats(ii, jj, 2); + compute_ylm(ii,jj,xn,yn,zn,lmax); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::operator() (TagPairPACEComputeAi, const typename Kokkos::TeamPolicy::member_type& team) const +{ + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % + ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + const int i = d_ilist[ii + chunk_offset]; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ncount = d_ncount(ii); + if (jj >= ncount) return; + + const int mu_j = d_mu(ii, jj); + + // rank = 1 + for (int n = 0; n < nradbase; n++) + Kokkos::atomic_add(&A_rank1(ii, mu_j, n), gr(ii, jj, n) * Y00); + + // rank > 1 + for (int n = 0; n < nradmax; n++) { + for (int l = 0; l <= lmax; l++) { + for (int m = 0; m <= l; m++) { + const int idx = l * (l + 1) + m; // (l, m) + Kokkos::atomic_add(&A(ii, mu_j, n, idx).re, fr(ii, jj, n, l) * ylm(ii, jj, idx).re); + Kokkos::atomic_add(&A(ii, mu_j, n, idx).im, fr(ii, jj, n, l) * ylm(ii, jj, idx).im); + } + } + } + + // hard-core repulsion + Kokkos::atomic_add(&rho_core(ii), cr(ii, jj)); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::operator() (TagPairPACEConjugateAi, const int& ii) const +{ + const int i = d_ilist[ii + chunk_offset]; + + //complex conjugate A's (for NEGATIVE (-m) terms) + // for rank > 1 + for (int mu_j = 0; mu_j < nelements; mu_j++) { + for (int n = 0; n < nradmax; n++) { + for (int l = 0; l <= lmax; l++) { + //fill in -m part in the outer loop using the same m <-> -m symmetry as for Ylm + for (int m = 1; m <= l; m++) { + const int idx = l * (l + 1) + m; // (l, m) + const int idxm = l * (l + 1) - m; // (l, -m) + const int factor = m % 2 == 0 ? 1 : -1; + A(ii, mu_j, n, idxm) = A(ii, mu_j, n, idx).conj() * (double)factor; + } + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::operator() (TagPairPACEComputeRho, const int& iter) const +{ + const int idx_rho = iter / chunk_size; + const int ii = iter % chunk_size; + + const int i = d_ilist[ii + chunk_offset]; + const int mu_i = d_map(type(i)); + + if (idx_rho >= d_idx_rho_count(mu_i)) return; + + const int ndensity = d_ndensity(mu_i); + + const int offset = d_offsets(mu_i, idx_rho); + const int rank = d_rank(mu_i, offset); + const int r = rank - 1; + + // Basis functions B with iterative product and density rho(p) calculation + if (rank == 1) { + const int mu = d_mus(mu_i, offset, 0); + const int n = d_ns(mu_i, offset, 0); + double A_cur = A_rank1(ii, mu, n - 1); + for (int p = 0; p < ndensity; ++p) { + //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1 + Kokkos::atomic_add(&rhos(ii, p), d_ctildes(mu_i, idx_rho, p) * A_cur); + } + } else { // rank > 1 + // loop over {ms} combinations in sum + + // loop over m, collect B = product of A with given ms + A_forward_prod(ii, idx_rho, 0) = complex::one(); + + // fill forward A-product triangle + for (int t = 0; t < rank; t++) { + //TODO: optimize ns[t]-1 -> ns[t] during functions construction + const int mu = d_mus(mu_i, offset, t); + const int n = d_ns(mu_i, offset, t); + const int l = d_ls(mu_i, offset, t); + const int m = d_ms_combs(mu_i, idx_rho, t); // current ms-combination (of length = rank) + const int idx = l * (l + 1) + m; // (l, m) + A_list(ii, idx_rho, t) = A(ii, mu, n - 1, idx); + A_forward_prod(ii, idx_rho, t + 1) = A_forward_prod(ii, idx_rho, t) * A_list(ii, idx_rho, t); + } + + complex A_backward_prod = complex::one(); + + // fill backward A-product triangle + for (int t = r; t >= 1; t--) { + const complex dB = A_forward_prod(ii, idx_rho, t) * A_backward_prod; // dB - product of all A's except t-th + dB_flatten(ii, idx_rho, t) = dB; + + A_backward_prod = A_backward_prod * A_list(ii, idx_rho, t); + } + dB_flatten(ii, idx_rho, 0) = A_forward_prod(ii, idx_rho, 0) * A_backward_prod; + + const complex B = A_forward_prod(ii, idx_rho, rank); + + for (int p = 0; p < ndensity; ++p) { + // real-part only multiplication + Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_ctildes(mu_i, idx_rho, p))); + } + } +} + +/* ---------------------------------------------------------------------- */ + + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::operator() (TagPairPACEComputeFS, const int& ii) const +{ + const int i = d_ilist[ii + chunk_offset]; + const int mu_i = d_map(type(i)); + + const double rho_cut = d_rho_core_cutoff(mu_i); + const double drho_cut = d_drho_core_cutoff(mu_i); + const int ndensity = d_ndensity(mu_i); + + double evdwl, fcut, dfcut; + evdwl = fcut = dfcut = 0.0; + + inner_cutoff(rho_core(ii), rho_cut, drho_cut, fcut, dfcut); + FS_values_and_derivatives(ii, evdwl, mu_i); + + dF_drho_core(ii) = evdwl * dfcut + 1; + for (int p = 0; p < ndensity; ++p) + dF_drho(ii, p) *= fcut; + + + // tally energy contribution + if (eflag) { + double evdwl_cut = evdwl * fcut + rho_core(ii); + // E0 shift + evdwl_cut += d_E0vals(mu_i); + e_atom(ii) = evdwl_cut; + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::operator() (TagPairPACEComputeWeights, const int& iter) const +{ + const int idx_rho = iter / chunk_size; + const int ii = iter % chunk_size; + + const int i = d_ilist[ii + chunk_offset]; + const int mu_i = d_map(type(i)); + + if (idx_rho >= d_idx_rho_count(mu_i)) return; + + const int ndensity = d_ndensity(mu_i); + + const int offset = d_offsets(mu_i, idx_rho); + const int rank = d_rank(mu_i, offset); + + // Weights and theta calculation + + if (rank == 1) { + const int mu = d_mus(mu_i, offset, 0); + const int n = d_ns(mu_i, offset, 0); + double theta = 0.0; + for (int p = 0; p < ndensity; ++p) { + // for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1 + theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p); + } + Kokkos::atomic_add(&weights_rank1(ii, mu, n - 1), theta); + } else { // rank > 1 + double theta = 0.0; + for (int p = 0; p < ndensity; ++p) + theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p); + + theta *= 0.5; // 0.5 factor due to possible double counting ??? + for (int t = 0; t < rank; ++t) { + const int m_t = d_ms_combs(mu_i, idx_rho, t); + const int factor = (m_t % 2 == 0 ? 1 : -1); + const complex dB = dB_flatten(ii, idx_rho, t); + const int mu_t = d_mus(mu_i, offset, t); + const int n_t = d_ns(mu_i, offset, t); + const int l_t = d_ls(mu_i, offset, t); + const int idx = l_t * (l_t + 1) + m_t; // (l, m) + const complex value = theta * dB; + Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).re), value.re); + Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).im), value.im); + // update -m_t (that could also be positive), because the basis is half_basis + const int idxm = l_t * (l_t + 1) - m_t; // (l, -m) + const complex valuem = theta * dB.conj() * (double)factor; + Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).re), valuem.re); + Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).im), valuem.im); + } + } +} + +/* ---------------------------------------------------------------------- */ +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::operator() (TagPairPACEComputeDerivative, const typename Kokkos::TeamPolicy::member_type& team) const +{ + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % + ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + const int i = d_ilist[ii + chunk_offset]; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ncount = d_ncount(ii); + if (jj >= ncount) return; + + const int itype = type(i); + const double scale = d_scale(itype,itype); + + int j = d_nearest(ii,jj); + + const int mu_j = d_mu(ii, jj); + double r_hat[3]; + r_hat[0] = d_rhats(ii, jj, 0); + r_hat[1] = d_rhats(ii, jj, 1); + r_hat[2] = d_rhats(ii, jj, 2); + const double r = d_rnorms(ii, jj); + const double rinv = 1.0/r; + + double f_ji[3]; + f_ji[0] = f_ji[1] = f_ji[2] = 0; + + // for rank = 1 + for (int n = 0; n < nradbase; ++n) { + if (weights_rank1(ii, mu_j, n) == 0) continue; + double &DG = dgr(ii, jj, n); + double DGR = DG * Y00; + DGR *= weights_rank1(ii, mu_j, n); + f_ji[0] += DGR * r_hat[0]; + f_ji[1] += DGR * r_hat[1]; + f_ji[2] += DGR * r_hat[2]; + } + + // for rank > 1 + for (int n = 0; n < nradmax; n++) { + for (int l = 0; l <= lmax; l++) { + const double R_over_r = fr(ii, jj, n, l) * rinv; + const double DR = dfr(ii, jj, n, l); + + // for m >= 0 + for (int m = 0; m <= l; m++) { + const int idx = l * (l + 1) + m; // (l, m) + complex w = weights(ii, mu_j, n, idx); + if (w.re == 0.0 && w.im == 0.0) continue; + // counting for -m cases if m > 0 + if (m > 0) { + w.re *= 2.0; + w.im *= 2.0; + } + + complex DY[3]; + DY[0] = dylm(ii, jj, idx, 0); + DY[1] = dylm(ii, jj, idx, 1); + DY[2] = dylm(ii, jj, idx, 2); + const complex Y_DR = ylm(ii, jj, idx) * DR; + + complex grad_phi_nlm[3]; + grad_phi_nlm[0] = Y_DR * r_hat[0] + DY[0] * R_over_r; + grad_phi_nlm[1] = Y_DR * r_hat[1] + DY[1] * R_over_r; + grad_phi_nlm[2] = Y_DR * r_hat[2] + DY[2] * R_over_r; + // real-part multiplication only + f_ji[0] += w.real_part_product(grad_phi_nlm[0]); + f_ji[1] += w.real_part_product(grad_phi_nlm[1]); + f_ji[2] += w.real_part_product(grad_phi_nlm[2]); + } + } + } + + // hard-core repulsion + const double fpair = dF_drho_core(ii) * dcr(ii,jj); + f_ij(ii, jj, 0) = scale * f_ji[0] + fpair * r_hat[0]; + f_ij(ii, jj, 1) = scale * f_ji[1] + fpair * r_hat[1]; + f_ij(ii, jj, 2) = scale * f_ji[2] + fpair * r_hat[2]; +} + +/* ---------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::operator() (TagPairPACEComputeForce, const int& ii, EV_FLOAT& ev) const +{ + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + const auto v_f = ScatterViewHelper::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + const auto a_f = v_f.template access::value>(); + + const int i = d_ilist[ii + chunk_offset]; + const int itype = type(i); + const double scale = d_scale(itype,itype); + + const int ncount = d_ncount(ii); + + F_FLOAT fitmp[3] = {0.0,0.0,0.0}; + for (int jj = 0; jj < ncount; jj++) { + int j = d_nearest(ii,jj); + + double r_hat[3]; + r_hat[0] = d_rhats(ii, jj, 0); + r_hat[1] = d_rhats(ii, jj, 1); + r_hat[2] = d_rhats(ii, jj, 2); + const double r = d_rnorms(ii, jj); + const double delx = -r_hat[0]*r; + const double dely = -r_hat[1]*r; + const double delz = -r_hat[2]*r; + + const double fpairx = f_ij(ii, jj, 0); + const double fpairy = f_ij(ii, jj, 1); + const double fpairz = f_ij(ii, jj, 2); + + fitmp[0] += fpairx; + fitmp[1] += fpairy; + fitmp[2] += fpairz; + a_f(j,0) -= fpairx; + a_f(j,1) -= fpairy; + a_f(j,2) -= fpairz; + + // tally per-atom virial contribution + if (EVFLAG && vflag_either) + v_tally_xyz(ev, i, j, fpairx, fpairy, fpairz, delx, dely, delz); + } + + a_f(i,0) += fitmp[0]; + a_f(i,1) += fitmp[1]; + a_f(i,2) += fitmp[2]; + + // tally energy contribution + if (EVFLAG && eflag_either) { + const double evdwl = scale*e_atom(ii); + //ev_tally_full(i, 2.0 * evdwl, 0.0, 0.0, 0.0, 0.0, 0.0); + if (eflag_global) ev.evdwl += evdwl; + if (eflag_atom) d_eatom[i] += evdwl; + } +} + +template +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::operator() (TagPairPACEComputeForce,const int& ii) const { + EV_FLOAT ev; + this->template operator()(TagPairPACEComputeForce(), ii, ev); +} + +/* ---------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz, + const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const +{ + // The vatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_vatom = ScatterViewHelper,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access>(); + + const E_FLOAT v0 = delx*fx; + const E_FLOAT v1 = dely*fy; + const E_FLOAT v2 = delz*fz; + const E_FLOAT v3 = delx*fy; + const E_FLOAT v4 = delx*fz; + const E_FLOAT v5 = dely*fz; + + if (vflag_global) { + ev.v[0] += v0; + ev.v[1] += v1; + ev.v[2] += v2; + ev.v[3] += v3; + ev.v[4] += v4; + ev.v[5] += v5; + } + + if (vflag_atom) { + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEKokkos::pre_compute_harmonics(int lmax) +{ + auto h_alm = Kokkos::create_mirror_view(alm); + auto h_blm = Kokkos::create_mirror_view(blm); + auto h_cl = Kokkos::create_mirror_view(cl); + auto h_dl = Kokkos::create_mirror_view(dl); + + for (int l = 1; l <= lmax; l++) { + const double lsq = l * l; + const double ld = 2 * l; + const double l1 = (4 * lsq - 1); + const double l2 = lsq - ld + 1; + for (int m = 0; m < l - 1; m++) { + const double msq = m * m; + const double a = sqrt((double(l1)) / (double(lsq - msq))); + const double b = -sqrt((double(l2 - msq)) / (double(4 * l2 - 1))); + const int idx = l * (l + 1) + m; // (l, m) + h_alm(idx) = a; + h_blm(idx) = b; + } + } + + for (int l = 1; l <= lmax; l++) { + h_cl(l) = -sqrt(1.0 + 0.5 / (double(l))); + h_dl(l) = sqrt(double(2 * (l - 1) + 3)); + } + + Kokkos::deep_copy(alm, h_alm); + Kokkos::deep_copy(blm, h_blm); + Kokkos::deep_copy(cl, h_cl); + Kokkos::deep_copy(dl, h_dl); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::compute_barplm(int ii, int jj, double rz, int lmax) const +{ + // requires -1 <= rz <= 1 , NO CHECKING IS PERFORMED !!!!!!!!! + // prefactors include 1/sqrt(2) factor compared to reference + + // l=0, m=0 + // plm(ii, jj, 0, 0) = Y00/sq1o4pi; //= sq1o4pi; + plm(ii, jj, 0) = Y00; //= 1; + dplm(ii, jj, 0) = 0.0; + + if (lmax > 0) { + + // l=1, m=0 + plm(ii, jj, 2) = Y00 * sq3 * rz; + dplm(ii, jj, 2) = Y00 * sq3; + + // l=1, m=1 + plm(ii, jj, 3) = -sq3o2 * Y00; + dplm(ii, jj, 3) = 0.0; + + // loop l = 2, lmax + for (int l = 2; l <= lmax; l++) { + for (int m = 0; m < l - 1; m++) { + const int idx = l * (l + 1) + m; // (l, m) + const int idx1 = (l - 1) * l + m; // (l - 1, m) + const int idx2 = (l - 2) * (l - 1) + m; // (l - 2, m) + plm(ii, jj, idx) = alm(idx) * (rz * plm(ii, jj, idx1) + blm(idx) * plm(ii, jj, idx2)); + dplm(ii, jj, idx) = alm(idx) * (plm(ii, jj, idx1) + rz * dplm(ii, jj, idx1) + blm(idx) * dplm(ii, jj, idx2)); + } + const int idx = l * (l + 1) + l; // (l, l) + const int idx1 = l * (l + 1) + l - 1; // (l, l - 1) + const int idx2 = (l - 1) * l + l - 1; // (l - 1, l - 1) + const double t = dl(l) * plm(ii, jj, idx2); + plm(ii, jj, idx1) = t * rz; + dplm(ii, jj, idx1) = t; + plm(ii, jj, idx) = cl(l) * plm(ii, jj, idx2); + dplm(ii, jj, idx) = 0.0; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::compute_ylm(int ii, int jj, double rx, double ry, double rz, int lmax) const +{ + // requires rx^2 + ry^2 + rz^2 = 1 , NO CHECKING IS PERFORMED !!!!!!!!! + + complex phase; + complex phasem, mphasem1; + complex dyx, dyy, dyz; + complex rdy; + + phase.re = rx; + phase.im = ry; + + // compute barplm + compute_barplm(ii, jj, rz, lmax); + + // m = 0 + for (int l = 0; l <= lmax; l++) { + const int idx = l * (l + 1); + + ylm(ii, jj, idx).re = plm(ii, jj, idx); + ylm(ii, jj, idx).im = 0.0; + + dyz.re = dplm(ii, jj, idx); + rdy.re = dyz.re * rz; + + dylm(ii, jj, idx, 0).re = -rdy.re * rx; + dylm(ii, jj, idx, 0).im = 0.0; + dylm(ii, jj, idx, 1).re = -rdy.re * ry; + dylm(ii, jj, idx, 1).im = 0.0; + dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz; + dylm(ii, jj, idx, 2).im = 0; + } + // m = 1 + for (int l = 1; l <= lmax; l++) { + const int idx = l * (l + 1) + 1; + + ylm(ii, jj, idx) = phase * plm(ii, jj, idx); + + dyx.re = plm(ii, jj, idx); + dyx.im = 0.0; + dyy.re = 0.0; + dyy.im = plm(ii, jj, idx); + dyz.re = phase.re * dplm(ii, jj, idx); + dyz.im = phase.im * dplm(ii, jj, idx); + + rdy.re = rx * dyx.re + +rz * dyz.re; + rdy.im = ry * dyy.im + rz * dyz.im; + + dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx; + dylm(ii, jj, idx, 0).im = -rdy.im * rx; + dylm(ii, jj, idx, 1).re = -rdy.re * ry; + dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry; + dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz; + dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz; + } + + // m > 1 + phasem = phase; + for (int m = 2; m <= lmax; m++) { + + mphasem1.re = phasem.re * double(m); + mphasem1.im = phasem.im * double(m); + phasem = phasem * phase; + + for (int l = m; l <= lmax; l++) { + const int idx = l * (l + 1) + m; + + ylm(ii, jj, idx).re = phasem.re * plm(ii, jj, idx); + ylm(ii, jj, idx).im = phasem.im * plm(ii, jj, idx); + + dyx = mphasem1 * plm(ii, jj, idx); + dyy.re = -dyx.im; + dyy.im = dyx.re; + dyz = phasem * dplm(ii, jj, idx); + + rdy.re = rx * dyx.re + ry * dyy.re + rz * dyz.re; + rdy.im = rx * dyx.im + ry * dyy.im + rz * dyz.im; + + dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx; + dylm(ii, jj, idx, 0).im = dyx.im - rdy.im * rx; + dylm(ii, jj, idx, 1).re = dyy.re - rdy.re * ry; + dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry; + dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz; + dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::cutoff_func_poly(const double r, const double r_in, const double delta_in, double &fc, double &dfc) const +{ + if (r <= r_in-delta_in) { + fc = 1; + dfc = 0; + } else if (r >= r_in ) { + fc = 0; + dfc = 0; + } else { + double x = 1 - 2 * (1 + (r - r_in) / delta_in); + fc = 0.5 + 7.5 / 2. * (x / 4. - pow(x, 3) / 6. + pow(x, 5) / 20.); + dfc = -7.5 / delta_in * (0.25 - x * x / 2.0 + pow(x, 4) / 4.); + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::Fexp(const double x, const double m, double &F, double &DF) const +{ + const double w = 1.e6; + const double eps = 1e-10; + + const double lambda = pow(1.0 / w, m - 1.0); + if (abs(x) > eps) { + double g; + const double a = abs(x); + const double am = pow(a, m); + const double w3x3 = pow(w * a, 3); //// use cube + const double sign_factor = (signbit(x) ? -1 : 1); + if (w3x3 > 30.0) + g = 0.0; + else + g = exp(-w3x3); + + const double omg = 1.0 - g; + F = sign_factor * (omg * am + lambda * g * a); + const double dg = -3.0 * w * w * w * a * a * g; + DF = m * pow(a, m - 1.0) * omg - am * dg + lambda * dg * a + lambda * g; + } else { + F = lambda * x; + DF = lambda; + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::FexpShiftedScaled(const double rho, const double mexp, double &F, double &DF) const +{ + const double eps = 1e-10; + + if (abs(mexp - 1.0) < eps) { + F = rho; + DF = 1; + } else { + const double a = abs(rho); + const double exprho = exp(-a); + const double nx = 1. / mexp; + const double xoff = pow(nx, (nx / (1.0 - nx))) * exprho; + const double yoff = pow(nx, (1 / (1.0 - nx))) * exprho; + const double sign_factor = (signbit(rho) ? -1 : 1); + F = sign_factor * (pow(xoff + a, mexp) - yoff); + DF = yoff + mexp * (-xoff + 1.0) * pow(xoff + a, mexp - 1.); + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::inner_cutoff(const double rho_core, const double rho_cut, const double drho_cut, + double &fcut, double &dfcut) const +{ + double rho_low = rho_cut - drho_cut; + if (rho_core >= rho_cut) { + fcut = 0; + dfcut = 0; + } else if (rho_core <= rho_low) { + fcut = 1; + dfcut = 0; + } else { + cutoff_func_poly(rho_core, rho_cut, drho_cut, fcut, dfcut); + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::FS_values_and_derivatives(const int ii, double &evdwl, const int mu_i) const +{ + double F, DF = 0; + int npoti = d_npoti(mu_i); + int ndensity = d_ndensity(mu_i); + for (int p = 0; p < ndensity; p++) { + const double wpre = d_wpre(mu_i, p); + const double mexp = d_mexp(mu_i, p); + + if (npoti == FS) + Fexp(rhos(ii, p), mexp, F, DF); + else if (npoti == FS_SHIFTEDSCALED) + FexpShiftedScaled(rhos(ii, p), mexp, F, DF); + + evdwl += F * wpre; // * weight (wpre) + dF_drho(ii, p) = DF * wpre; // * weight (wpre) + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::evaluate_splines(const int ii, const int jj, double r, int nradbase_c, int nradial_c, int mu_i, + int mu_j) const +{ + auto &spline_gk = k_splines_gk.template view()(mu_i, mu_j); + auto &spline_rnl = k_splines_rnl.template view()(mu_i, mu_j); + auto &spline_hc = k_splines_hc.template view()(mu_i, mu_j); + + spline_gk.calcSplines(ii, jj, r, gr, dgr); + + spline_rnl.calcSplines(ii, jj, r, d_values, d_derivatives); + for (int kk = 0; kk < fr.extent(2); kk++) { + for (int ll = 0; ll < fr.extent(3); ll++) { + const int flatten = kk*fr.extent(3) + ll; + fr(ii, jj, kk, ll) = d_values(ii, jj, flatten); + dfr(ii, jj, kk, ll) = d_derivatives(ii, jj, flatten); + } + } + + spline_hc.calcSplines(ii, jj, r, d_values, d_derivatives); + cr(ii, jj) = d_values(ii, jj, 0); + dcr(ii, jj) = d_derivatives(ii, jj, 0); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEKokkos::SplineInterpolatorKokkos::calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const +{ + double wl, wl2, wl3, w2l1, w3l2; + double c[4]; + double x = r * rscalelookup; + int nl = static_cast(floor(x)); + + if (nl <= 0) + Kokkos::abort("Encountered very small distance. Stopping."); + + if (nl < nlut) { + wl = x - double(nl); + wl2 = wl * wl; + wl3 = wl2 * wl; + w2l1 = 2.0 * wl; + w3l2 = 3.0 * wl2; + for (int func_id = 0; func_id < num_of_functions; func_id++) { + for (int idx = 0; idx < 4; idx++) + c[idx] = lookupTable(nl, func_id, idx); + d_values(ii, jj, func_id) = c[0] + c[1] * wl + c[2] * wl2 + c[3] * wl3; + d_derivatives(ii, jj, func_id) = (c[1] + c[2] * w2l1 + c[3] * w3l2) * rscalelookup; + } + } else { // fill with zeroes + for (int func_id = 0; func_id < num_of_functions; func_id++) { + d_values(ii, jj, func_id) = 0.0; + d_derivatives(ii, jj, func_id) = 0.0; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +template +void PairPACEKokkos::check_team_size_for(int inum, int &team_size, int vector_length) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +/* ---------------------------------------------------------------------- */ + +template +template +void PairPACEKokkos::check_team_size_reduce(int inum, int &team_size, int vector_length) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +int PairPACEKokkos::scratch_size_helper(int values_per_team) { + typedef Kokkos::View > ScratchViewType; + + return ScratchViewType::shmem_size(values_per_team); +} + +/* ---------------------------------------------------------------------- + memory usage of arrays +------------------------------------------------------------------------- */ + +template +double PairPACEKokkos::memory_usage() +{ + double bytes = 0; + + bytes += memoryKK->memory_usage(A); + bytes += memoryKK->memory_usage(A_rank1); + bytes += memoryKK->memory_usage(A_list); + bytes += memoryKK->memory_usage(A_forward_prod); + bytes += memoryKK->memory_usage(e_atom); + bytes += memoryKK->memory_usage(rhos); + bytes += memoryKK->memory_usage(dF_drho); + bytes += memoryKK->memory_usage(weights); + bytes += memoryKK->memory_usage(weights_rank1); + bytes += memoryKK->memory_usage(rho_core); + bytes += memoryKK->memory_usage(dF_drho_core); + bytes += memoryKK->memory_usage(dB_flatten); + bytes += memoryKK->memory_usage(fr); + bytes += memoryKK->memory_usage(dfr); + bytes += memoryKK->memory_usage(gr); + bytes += memoryKK->memory_usage(dgr); + bytes += memoryKK->memory_usage(d_values); + bytes += memoryKK->memory_usage(d_derivatives); + bytes += memoryKK->memory_usage(cr); + bytes += memoryKK->memory_usage(dcr); + bytes += memoryKK->memory_usage(plm); + bytes += memoryKK->memory_usage(dplm); + bytes += memoryKK->memory_usage(ylm); + bytes += memoryKK->memory_usage(dylm); + bytes += memoryKK->memory_usage(d_ncount); + bytes += memoryKK->memory_usage(d_mu); + bytes += memoryKK->memory_usage(d_rhats); + bytes += memoryKK->memory_usage(d_rnorms); + bytes += memoryKK->memory_usage(d_nearest); + bytes += memoryKK->memory_usage(f_ij); + bytes += memoryKK->memory_usage(d_rho_core_cutoff); + bytes += memoryKK->memory_usage(d_drho_core_cutoff); + bytes += memoryKK->memory_usage(d_E0vals); + bytes += memoryKK->memory_usage(d_ndensity); + bytes += memoryKK->memory_usage(d_npoti); + bytes += memoryKK->memory_usage(d_wpre); + bytes += memoryKK->memory_usage(d_mexp); + bytes += memoryKK->memory_usage(d_idx_rho_count); + bytes += memoryKK->memory_usage(d_rank); + bytes += memoryKK->memory_usage(d_num_ms_combs); + bytes += memoryKK->memory_usage(d_offsets); + bytes += memoryKK->memory_usage(d_mus); + bytes += memoryKK->memory_usage(d_ns); + bytes += memoryKK->memory_usage(d_ls); + bytes += memoryKK->memory_usage(d_ms_combs); + bytes += memoryKK->memory_usage(d_ctildes); + bytes += memoryKK->memory_usage(alm); + bytes += memoryKK->memory_usage(blm); + bytes += memoryKK->memory_usage(cl); + bytes += memoryKK->memory_usage(dl); + + for (int i = 0; i < nelements; i++) + for (int j = 0; j < nelements; j++) + bytes += k_splines_gk.h_view(i, j).memory_usage(); + + return bytes; +} + +/* ---------------------------------------------------------------------- */ + +namespace LAMMPS_NS { +template class PairPACEKokkos; +#ifdef LMP_KOKKOS_GPU +template class PairPACEKokkos; +#endif +} diff --git a/src/KOKKOS/pair_pace_kokkos.h b/src/KOKKOS/pair_pace_kokkos.h new file mode 100644 index 0000000000..d9f0b14835 --- /dev/null +++ b/src/KOKKOS/pair_pace_kokkos.h @@ -0,0 +1,334 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(pace/kk,PairPACEKokkos); +PairStyle(pace/kk/device,PairPACEKokkos); +PairStyle(pace/kk/host,PairPACEKokkos); +// clang-format on +#else + +// clang-format off +#ifndef LMP_PAIR_PACE_KOKKOS_H +#define LMP_PAIR_PACE_KOKKOS_H + +#include "pair_pace.h" +#include "ace_radial.h" +#include "kokkos_type.h" +#include "pair_kokkos.h" + +namespace LAMMPS_NS { + +template +class PairPACEKokkos : public PairPACE { + public: + struct TagPairPACEComputeNeigh{}; + struct TagPairPACEComputeRadial{}; + struct TagPairPACEComputeYlm{}; + struct TagPairPACEComputeAi{}; + struct TagPairPACEConjugateAi{}; + struct TagPairPACEComputeRho{}; + struct TagPairPACEComputeFS{}; + struct TagPairPACEComputeWeights{}; + struct TagPairPACEComputeDerivative{}; + + template + struct TagPairPACEComputeForce{}; + + typedef DeviceType device_type; + typedef ArrayTypes AT; + typedef EV_FLOAT value_type; + using complex = SNAComplex; + + PairPACEKokkos(class LAMMPS *); + ~PairPACEKokkos() override; + + void compute(int, int) override; + void coeff(int, char **) override; + void init_style() override; + double init_one(int, int) override; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeRadial,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeYlm,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeAi,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEConjugateAi,const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeRho,const int& iter) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeFS,const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeWeights,const int& iter) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeDerivative,const typename Kokkos::TeamPolicy::member_type& team) const; + + template + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeForce,const int& ii) const; + + template + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeForce,const int& ii, EV_FLOAT&) const; + + protected: + int inum, maxneigh, chunk_size, chunk_offset, idx_rho_max; + int host_flag; + + int eflag, vflag; + + int neighflag, max_ndensity; + int nelements, lmax, nradmax, nradbase; + + typename AT::t_neighbors_2d d_neighbors; + typename AT::t_int_1d_randomread d_ilist; + typename AT::t_int_1d_randomread d_numneigh; + + DAT::tdual_efloat_1d k_eatom; + DAT::tdual_virial_array k_vatom; + typename AT::t_efloat_1d d_eatom; + typename AT::t_virial_array d_vatom; + + typename AT::t_x_array_randomread x; + typename AT::t_f_array f; + typename AT::t_int_1d_randomread type; + + typedef Kokkos::DualView tdual_fparams; + tdual_fparams k_cutsq, k_scale; + typedef Kokkos::View t_fparams; + t_fparams d_cutsq, d_scale; + + typename AT::t_int_1d d_map; + + int need_dup; + + using KKDeviceType = typename KKDevice::value; + + template + using DupScatterView = KKScatterView; + + template + using NonDupScatterView = KKScatterView; + + DupScatterView dup_f; + DupScatterView dup_vatom; + + NonDupScatterView ndup_f; + NonDupScatterView ndup_vatom; + + friend void pair_virial_fdotr_compute(PairPACEKokkos*); + + void grow(int, int); + void copy_pertype(); + void copy_splines(); + void copy_tilde(); + void allocate() override; + void precompute_harmonics(); + double memory_usage(); + + template + KOKKOS_INLINE_FUNCTION + void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz, + const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const; + + KOKKOS_INLINE_FUNCTION + void compute_barplm(int, int, double, int) const; + + KOKKOS_INLINE_FUNCTION + void compute_ylm(int, int, double, double, double, int) const; + + KOKKOS_INLINE_FUNCTION + void cutoff_func_poly(const double, const double, const double, double &, double &) const; + + KOKKOS_INLINE_FUNCTION + void Fexp(const double, const double, double &, double &) const; + + KOKKOS_INLINE_FUNCTION + void FexpShiftedScaled(const double, const double, double &, double &) const; + + KOKKOS_INLINE_FUNCTION + void inner_cutoff(const double, const double, const double, double &, double &) const; + + KOKKOS_INLINE_FUNCTION + void FS_values_and_derivatives(const int, double&, const int) const; + + KOKKOS_INLINE_FUNCTION + void evaluate_splines(const int, const int, double, int, int, int, int) const; + + template + void check_team_size_for(int, int&, int); + + template + void check_team_size_reduce(int, int&, int); + + // Utility routine which wraps computing per-team scratch size requirements for + // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj + template + int scratch_size_helper(int values_per_team); + + typedef Kokkos::View t_ace_1i; + typedef Kokkos::View t_ace_2i; + typedef Kokkos::View t_ace_3i; + typedef Kokkos::View t_ace_4i; + typedef Kokkos::View t_ace_1d; + typedef Kokkos::View t_ace_2d; + typedef Kokkos::View t_ace_2d3; + typedef Kokkos::View t_ace_3d; + typedef Kokkos::View t_ace_3d3; + typedef Kokkos::View t_ace_3d4; + typedef Kokkos::View t_ace_4d; + typedef Kokkos::View t_ace_1c; + typedef Kokkos::View t_ace_2c; + typedef Kokkos::View t_ace_3c; + typedef Kokkos::View t_ace_3c3; + typedef Kokkos::View t_ace_4c; + typedef Kokkos::View t_ace_4c3; + + t_ace_3d A_rank1; + t_ace_4c A; + + t_ace_3c A_list; + t_ace_3c A_forward_prod; + + t_ace_3d weights_rank1; + t_ace_4c weights; + + t_ace_1d e_atom; + t_ace_2d rhos; + t_ace_2d dF_drho; + + // hard-core repulsion + t_ace_1d rho_core; + t_ace_3c dB_flatten; + t_ace_2d cr; + t_ace_2d dcr; + t_ace_1d dF_drho_core; + + // radial functions + t_ace_4d fr; + t_ace_4d dfr; + t_ace_3d gr; + t_ace_3d dgr; + t_ace_3d d_values; + t_ace_3d d_derivatives; + + // Spherical Harmonics + + void pre_compute_harmonics(int); + + KOKKOS_INLINE_FUNCTION + void compute_barplm(double rz, int lmaxi); + + KOKKOS_INLINE_FUNCTION + void compute_ylm(double rx, double ry, double rz, int lmaxi); + + t_ace_1d alm; + t_ace_1d blm; + t_ace_1d cl; + t_ace_1d dl; + + t_ace_3d plm; + t_ace_3d dplm; + + t_ace_3c ylm; + t_ace_4c3 dylm; + + // short neigh list + t_ace_1i d_ncount; + t_ace_2d d_mu; + t_ace_2d d_rnorms; + t_ace_3d3 d_rhats; + t_ace_2i d_nearest; + + // per-type + t_ace_1i d_ndensity; + t_ace_1i d_npoti; + t_ace_1d d_rho_core_cutoff; + t_ace_1d d_drho_core_cutoff; + t_ace_1d d_E0vals; + t_ace_2d d_wpre; + t_ace_2d d_mexp; + + // tilde + t_ace_1i d_idx_rho_count; + t_ace_2i d_rank; + t_ace_2i d_num_ms_combs; + t_ace_2i d_offsets; + t_ace_3i d_mus; + t_ace_3i d_ns; + t_ace_3i d_ls; + t_ace_3i d_ms_combs; + t_ace_3d d_ctildes; + + t_ace_3d3 f_ij; + + public: + struct SplineInterpolatorKokkos { + int ntot, nlut, num_of_functions; + double cutoff, deltaSplineBins, invrscalelookup, rscalelookup; + + t_ace_3d4 lookupTable; + + void operator=(const SplineInterpolator &spline) { + cutoff = spline.cutoff; + deltaSplineBins = spline.deltaSplineBins; + ntot = spline.ntot; + nlut = spline.nlut; + invrscalelookup = spline.invrscalelookup; + rscalelookup = spline.rscalelookup; + num_of_functions = spline.num_of_functions; + + lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions); + auto h_lookupTable = Kokkos::create_mirror_view(lookupTable); + for (int i = 0; i < ntot+1; i++) + for (int j = 0; j < num_of_functions; j++) + for (int k = 0; k < 4; k++) + h_lookupTable(i, j, k) = spline.lookupTable(i, j, k); + Kokkos::deep_copy(lookupTable, h_lookupTable); + } + + void deallocate() { + lookupTable = t_ace_3d4(); + } + + double memory_usage() { + return lookupTable.span() * sizeof(typename decltype(lookupTable)::value_type); + } + + KOKKOS_INLINE_FUNCTION + void calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const; + }; + + Kokkos::DualView k_splines_gk; + Kokkos::DualView k_splines_rnl; + Kokkos::DualView k_splines_hc; + +}; +} // namespace LAMMPS_NS + +#endif +#endif diff --git a/src/KOKKOS/pair_snap_kokkos_impl.h b/src/KOKKOS/pair_snap_kokkos_impl.h index 74264ca47e..c2657f8534 100644 --- a/src/KOKKOS/pair_snap_kokkos_impl.h +++ b/src/KOKKOS/pair_snap_kokkos_impl.h @@ -91,9 +91,8 @@ void PairSNAPKokkos::init_style() { if (host_flag) { if (lmp->kokkos->nthreads > 1) - if (comm->me == 0) - utils::logmesg(lmp,"Pair style snap/kk currently only runs on a single " - "CPU thread, even if more threads are requested\n"); + error->all(FLERR,"Pair style snap/kk can currently only run on a single " + "CPU thread"); PairSNAP::init_style(); return; diff --git a/src/ML-PACE/pair_pace.cpp b/src/ML-PACE/pair_pace.cpp index 11e8a659a5..a583ebe508 100644 --- a/src/ML-PACE/pair_pace.cpp +++ b/src/ML-PACE/pair_pace.cpp @@ -91,6 +91,8 @@ PairPACE::PairPACE(LAMMPS *lmp) : Pair(lmp) recursive = false; scale = nullptr; + + chunksize = 4096; } /* ---------------------------------------------------------------------- @@ -250,18 +252,25 @@ void PairPACE::allocate() void PairPACE::settings(int narg, char **arg) { - if (narg > 1) error->all(FLERR, "Illegal pair_style command."); + if (narg > 3) error->all(FLERR, "Illegal pair_style command."); // ACE potentials are parameterized in metal units if (strcmp("metal", update->unit_style) != 0) error->all(FLERR, "ACE potentials require 'metal' units"); recursive = true; // default evaluator style: RECURSIVE - if (narg > 0) { - if (strcmp(arg[0], "recursive") == 0) + + int iarg = 0; + while (iarg < narg) { + if (strcmp(arg[iarg], "recursive") == 0) { recursive = true; - else if (strcmp(arg[0], "product") == 0) { + iarg += 1; + } else if (strcmp(arg[iarg], "product") == 0) { recursive = false; + iarg += 1; + } else if (strcmp(arg[iarg], "chunksize") == 0) { + chunksize = utils::inumeric(FLERR,arg[iarg+1],false,lmp); + iarg += 2; } else error->all(FLERR, "Illegal pair_style command"); } diff --git a/src/ML-PACE/pair_pace.h b/src/ML-PACE/pair_pace.h index c932b5a679..1f0cffeb3f 100644 --- a/src/ML-PACE/pair_pace.h +++ b/src/ML-PACE/pair_pace.h @@ -56,6 +56,8 @@ class PairPACE : public Pair { double **scale; bool recursive; // "recursive" option for ACERecursiveEvaluator + + int chunksize; }; } // namespace LAMMPS_NS