From 656505642405168137d531896e106172151791ce Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Mon, 2 Jan 2023 12:21:00 +0100 Subject: [PATCH 01/12] WIP: add pair_pace_extrapolation_kokkos.cpp/h --- src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 1724 +++++++++++++++++ src/KOKKOS/pair_pace_extrapolation_kokkos.h | 334 ++++ 2 files changed, 2058 insertions(+) create mode 100644 src/KOKKOS/pair_pace_extrapolation_kokkos.cpp create mode 100644 src/KOKKOS/pair_pace_extrapolation_kokkos.h diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp new file mode 100644 index 0000000000..b4f4932db4 --- /dev/null +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp @@ -0,0 +1,1724 @@ +// clang-format off +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + aE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Stan Moore (SNL) +------------------------------------------------------------------------- */ + +#include "pair_pace_extrapolation_kokkos.h" + +#include "atom_kokkos.h" +#include "atom_masks.h" +#include "comm.h" +#include "error.h" +#include "force.h" +#include "kokkos.h" +#include "math_const.h" +#include "memory_kokkos.h" +#include "neighbor_kokkos.h" +#include "neigh_request.h" + +#include "ace-evaluator/ace_c_basis.h" +#include "ace-evaluator/ace_evaluator.h" +#include "ace-evaluator/ace_recursive.h" +#include "ace-evaluator/ace_version.h" +#include "ace-evaluator/ace_radial.h" +#include + +namespace LAMMPS_NS { +struct ACEImpl { + ACEImpl() : basis_set(nullptr), ace(nullptr) {} + ~ACEImpl() + { + delete basis_set; + delete ace; + } + ACECTildeBasisSet *basis_set; + ACERecursiveEvaluator *ace; +}; +} // namespace LAMMPS_NS + +using namespace LAMMPS_NS; +using namespace MathConst; + +enum{FS,FS_SHIFTEDSCALED}; + +/* ---------------------------------------------------------------------- */ + +template +PairPACEExtrapolationKokkos::PairPACEExtrapolationKokkos(LAMMPS *lmp) : PairPACE(lmp) +{ + respa_enable = 0; + + kokkosable = 1; + atomKK = (AtomKokkos *) atom; + execution_space = ExecutionSpaceFromDevice::space; + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; + + host_flag = (execution_space == Host); +} + +/* ---------------------------------------------------------------------- + check if allocated, since class can be destructed when incomplete +------------------------------------------------------------------------- */ + +template +PairPACEExtrapolationKokkos::~PairPACEExtrapolationKokkos() +{ + if (copymode) return; + + memoryKK->destroy_kokkos(k_eatom,eatom); + memoryKK->destroy_kokkos(k_vatom,vatom); + + // deallocate views of views in serial to prevent issues in Kokkos tools + + if (k_splines_gk.h_view.data()) { + for (int i = 0; i < nelements; i++) { + for (int j = 0; j < nelements; j++) { + k_splines_gk.h_view(i, j).deallocate(); + k_splines_rnl.h_view(i, j).deallocate(); + k_splines_hc.h_view(i, j).deallocate(); + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEExtrapolationKokkos::grow(int natom, int maxneigh) +{ + auto basis_set = aceimpl->basis_set; + + if ((int)A.extent(0) < natom) { + + MemKK::realloc_kokkos(A, "pace:A", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1)); + MemKK::realloc_kokkos(A_rank1, "pace:A_rank1", natom, nelements, nradbase); + + MemKK::realloc_kokkos(A_list, "pace:A_list", natom, idx_rho_max, basis_set->rankmax); + //size is +1 of max to avoid out-of-boundary array access in double-triangular scheme + MemKK::realloc_kokkos(A_forward_prod, "pace:A_forward_prod", natom, idx_rho_max, basis_set->rankmax + 1); + + MemKK::realloc_kokkos(e_atom, "pace:e_atom", natom); + MemKK::realloc_kokkos(rhos, "pace:rhos", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion + MemKK::realloc_kokkos(dF_drho, "pace:dF_drho", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion + + MemKK::realloc_kokkos(weights, "pace:weights", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1)); + MemKK::realloc_kokkos(weights_rank1, "pace:weights_rank1", natom, nelements, nradbase); + + // hard-core repulsion + MemKK::realloc_kokkos(rho_core, "pace:rho_core", natom); + MemKK::realloc_kokkos(dF_drho_core, "pace:dF_drho_core", natom); + MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_rho_max, basis_set->rankmax); + } + + if (((int)ylm.extent(0) < natom) || ((int)ylm.extent(1) < maxneigh)) { + + // radial functions + MemKK::realloc_kokkos(fr, "pace:fr", natom, maxneigh, nradmax, lmax + 1); + MemKK::realloc_kokkos(dfr, "pace:dfr", natom, maxneigh, nradmax, lmax + 1); + MemKK::realloc_kokkos(gr, "pace:gr", natom, maxneigh, nradbase); + MemKK::realloc_kokkos(dgr, "pace:dgr", natom, maxneigh, nradbase); + const int max_num_functions = MAX(nradbase, nradmax*(lmax + 1)); + MemKK::realloc_kokkos(d_values, "pace:d_values", natom, maxneigh, max_num_functions); + MemKK::realloc_kokkos(d_derivatives, "pace:d_derivatives", natom, maxneigh, max_num_functions); + + // hard-core repulsion + MemKK::realloc_kokkos(cr, "pace:cr", natom, maxneigh); + MemKK::realloc_kokkos(dcr, "pace:dcr", natom, maxneigh); + + // spherical harmonics + MemKK::realloc_kokkos(plm, "pace:plm", natom, maxneigh, (lmax + 1) * (lmax + 1)); + MemKK::realloc_kokkos(dplm, "pace:dplm", natom, maxneigh, (lmax + 1) * (lmax + 1)); + MemKK::realloc_kokkos(ylm, "pace:ylm", natom, maxneigh, (lmax + 1) * (lmax + 1)); + MemKK::realloc_kokkos(dylm, "pace:dylm", natom, maxneigh, (lmax + 1) * (lmax + 1)); + + // short neigh list + MemKK::realloc_kokkos(d_ncount, "pace:ncount", natom); + MemKK::realloc_kokkos(d_mu, "pace:mu", natom, maxneigh); + MemKK::realloc_kokkos(d_rhats, "pace:rhats", natom, maxneigh); + MemKK::realloc_kokkos(d_rnorms, "pace:rnorms", natom, maxneigh); + MemKK::realloc_kokkos(d_nearest, "pace:nearest", natom, maxneigh); + + MemKK::realloc_kokkos(f_ij, "pace:f_ij", natom, maxneigh); + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEExtrapolationKokkos::copy_pertype() +{ + auto basis_set = aceimpl->basis_set; + + MemKK::realloc_kokkos(d_rho_core_cutoff, "pace:rho_core_cutoff", nelements); + MemKK::realloc_kokkos(d_drho_core_cutoff, "pace:drho_core_cutoff", nelements); + MemKK::realloc_kokkos(d_E0vals, "pace:E0vals", nelements); + MemKK::realloc_kokkos(d_ndensity, "pace:ndensity", nelements); + MemKK::realloc_kokkos(d_npoti, "pace:npoti", nelements); + + auto h_rho_core_cutoff = Kokkos::create_mirror_view(d_rho_core_cutoff); + auto h_drho_core_cutoff = Kokkos::create_mirror_view(d_drho_core_cutoff); + auto h_E0vals = Kokkos::create_mirror_view(d_E0vals); + auto h_ndensity = Kokkos::create_mirror_view(d_ndensity); + auto h_npoti = Kokkos::create_mirror_view(d_npoti); + + for (int n = 0; n < nelements; n++) { + h_rho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).rho_core_cutoff; + h_drho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).drho_core_cutoff; + + h_E0vals(n)= basis_set->E0vals(n); + + h_ndensity(n) = basis_set->map_embedding_specifications.at(n).ndensity; + + string npoti = basis_set->map_embedding_specifications.at(n).npoti; + if (npoti == "FinnisSinclair") + h_npoti(n) = FS; + else if (npoti == "FinnisSinclairShiftedScaled") + h_npoti(n) = FS_SHIFTEDSCALED; + } + + Kokkos::deep_copy(d_rho_core_cutoff, h_rho_core_cutoff); + Kokkos::deep_copy(d_drho_core_cutoff, h_drho_core_cutoff); + Kokkos::deep_copy(d_E0vals, h_E0vals); + Kokkos::deep_copy(d_ndensity, h_ndensity); + Kokkos::deep_copy(d_npoti, h_npoti); + + MemKK::realloc_kokkos(d_wpre, "pace:wpre", nelements, basis_set->ndensitymax); + MemKK::realloc_kokkos(d_mexp, "pace:mexp", nelements, basis_set->ndensitymax); + + auto h_wpre = Kokkos::create_mirror_view(d_wpre); + auto h_mexp = Kokkos::create_mirror_view(d_mexp); + + for (int n = 0; n < nelements; n++) { + const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity; + for (int p = 0; p < ndensity; p++) { + h_wpre(n, p) = basis_set->map_embedding_specifications.at(n).FS_parameters.at(p * 2 + 0); + h_mexp(n, p) = basis_set->map_embedding_specifications.at(n).FS_parameters.at(p * 2 + 1); + } + } + + Kokkos::deep_copy(d_wpre, h_wpre); + Kokkos::deep_copy(d_mexp, h_mexp); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEExtrapolationKokkos::copy_splines() +{ + auto basis_set = aceimpl->basis_set; + + if (k_splines_gk.d_view.data()) { + for (int i = 0; i < nelements; i++) { + for (int j = 0; j < nelements; j++) { + k_splines_gk.h_view(i, j).deallocate(); + k_splines_rnl.h_view(i, j).deallocate(); + k_splines_hc.h_view(i, j).deallocate(); + } + } + } + + k_splines_gk = Kokkos::DualView("pace:splines_gk", nelements, nelements); + k_splines_rnl = Kokkos::DualView("pace:splines_rnl", nelements, nelements); + k_splines_hc = Kokkos::DualView("pace:splines_hc", nelements, nelements); + + ACERadialFunctions* radial_functions = dynamic_cast(basis_set->radial_functions); + + for (int i = 0; i < nelements; i++) { + for (int j = 0; j < nelements; j++) { + k_splines_gk.h_view(i, j) = radial_functions->splines_gk(i, j); + k_splines_rnl.h_view(i, j) = radial_functions->splines_rnl(i, j); + k_splines_hc.h_view(i, j) = radial_functions->splines_hc(i, j); + } + } + + k_splines_gk.modify_host(); + k_splines_rnl.modify_host(); + k_splines_hc.modify_host(); + + k_splines_gk.sync_device(); + k_splines_rnl.sync_device(); + k_splines_hc.sync_device(); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEExtrapolationKokkos::copy_tilde() +{ + auto basis_set = aceimpl->basis_set; + + // flatten loops, get per-element count and max + + idx_rho_max = 0; + int total_basis_size_max = 0; + + MemKK::realloc_kokkos(d_idx_rho_count, "pace:idx_rho_count", nelements); + auto h_idx_rho_count = Kokkos::create_mirror_view(d_idx_rho_count); + + for (int n = 0; n < nelements; n++) { + int idx_rho = 0; + const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n]; + const int total_basis_size = basis_set->total_basis_size[n]; + + ACECTildeBasisFunction *basis = basis_set->basis[n]; + + // rank=1 + for (int func_rank1_ind = 0; func_rank1_ind < total_basis_size_rank1; ++func_rank1_ind) + idx_rho++; + + // rank > 1 + for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) { + ACECTildeBasisFunction *func = &basis[func_ind]; + + // loop over {ms} combinations in sum + for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) + idx_rho++; + } + h_idx_rho_count(n) = idx_rho; + idx_rho_max = MAX(idx_rho_max, idx_rho); + total_basis_size_max = MAX(total_basis_size_max, total_basis_size_rank1 + total_basis_size); + } + + Kokkos::deep_copy(d_idx_rho_count, h_idx_rho_count); + + MemKK::realloc_kokkos(d_rank, "pace:rank", nelements, total_basis_size_max); + MemKK::realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_basis_size_max); + MemKK::realloc_kokkos(d_offsets, "pace:offsets", nelements, idx_rho_max); + MemKK::realloc_kokkos(d_mus, "pace:mus", nelements, total_basis_size_max, basis_set->rankmax); + MemKK::realloc_kokkos(d_ns, "pace:ns", nelements, total_basis_size_max, basis_set->rankmax); + MemKK::realloc_kokkos(d_ls, "pace:ls", nelements, total_basis_size_max, basis_set->rankmax); + MemKK::realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_rho_max, basis_set->rankmax); + MemKK::realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_rho_max, basis_set->ndensitymax); + + auto h_rank = Kokkos::create_mirror_view(d_rank); + auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs); + auto h_offsets = Kokkos::create_mirror_view(d_offsets); + auto h_mus = Kokkos::create_mirror_view(d_mus); + auto h_ns = Kokkos::create_mirror_view(d_ns); + auto h_ls = Kokkos::create_mirror_view(d_ls); + auto h_ms_combs = Kokkos::create_mirror_view(d_ms_combs); + auto h_ctildes = Kokkos::create_mirror_view(d_ctildes); + + // copy values on host + + for (int n = 0; n < nelements; n++) { + const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n]; + const int total_basis_size = basis_set->total_basis_size[n]; + + ACECTildeBasisFunction *basis_rank1 = basis_set->basis_rank1[n]; + ACECTildeBasisFunction *basis = basis_set->basis[n]; + + const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity; + + int idx_rho = 0; + + // rank=1 + for (int offset = 0; offset < total_basis_size_rank1; ++offset) { + ACECTildeBasisFunction *func = &basis_rank1[offset]; + h_rank(n, offset) = 1; + h_mus(n, offset, 0) = func->mus[0]; + h_ns(n, offset, 0) = func->ns[0]; + for (int p = 0; p < ndensity; p++) + h_ctildes(n, idx_rho, p) = func->ctildes[p]; + h_offsets(n, idx_rho) = offset; + idx_rho++; + } + + // rank > 1 + for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) { + ACECTildeBasisFunction *func = &basis[func_ind]; + // TODO: check if func->ctildes are zero, then skip + + const int offset = total_basis_size_rank1 + func_ind; + + const int rank = h_rank(n, offset) = func->rank; + h_num_ms_combs(n, offset) = func->num_ms_combs; + for (int t = 0; t < rank; t++) { + h_mus(n, offset, t) = func->mus[t]; + h_ns(n, offset, t) = func->ns[t]; + h_ls(n, offset, t) = func->ls[t]; + } + + // loop over {ms} combinations in sum + for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) { + auto ms = &func->ms_combs[ms_ind * rank]; // current ms-combination (of length = rank) + for (int t = 0; t < rank; t++) + h_ms_combs(n, idx_rho, t) = ms[t]; + + for (int p = 0; p < ndensity; ++p) { + // real-part only multiplication + h_ctildes(n, idx_rho, p) = func->ctildes[ms_ind * ndensity + p]; + } + h_offsets(n, idx_rho) = offset; + idx_rho++; + } + } + } + + Kokkos::deep_copy(d_rank, h_rank); + Kokkos::deep_copy(d_num_ms_combs, h_num_ms_combs); + Kokkos::deep_copy(d_offsets, h_offsets); + Kokkos::deep_copy(d_mus, h_mus); + Kokkos::deep_copy(d_ns, h_ns); + Kokkos::deep_copy(d_ls, h_ls); + Kokkos::deep_copy(d_ms_combs, h_ms_combs); + Kokkos::deep_copy(d_ctildes, h_ctildes); +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +template +void PairPACEExtrapolationKokkos::init_style() +{ + if (host_flag) { + if (lmp->kokkos->nthreads > 1) + error->all(FLERR,"Pair style pace/kk can currently only run on a single " + "CPU thread"); + + PairPACE::init_style(); + return; + } + + if (atom->tag_enable == 0) error->all(FLERR, "Pair style PACE requires atom IDs"); + if (force->newton_pair == 0) error->all(FLERR, "Pair style PACE requires newton pair on"); + + // neighbor list request for KOKKOS + + neighflag = lmp->kokkos->neighflag; + + auto request = neighbor->add_request(this, NeighConst::REQ_FULL); + request->set_kokkos_host(std::is_same::value && + !std::is_same::value); + request->set_kokkos_device(std::is_same::value); + if (neighflag == FULL) + error->all(FLERR,"Must use half neighbor list style with pair pace/kk"); + + auto basis_set = aceimpl->basis_set; + + nelements = basis_set->nelements; + lmax = basis_set->lmax; + nradmax = basis_set->nradmax; + nradbase = basis_set->nradbase; + + // spherical harmonics + + MemKK::realloc_kokkos(alm, "pace:alm", (lmax + 1) * (lmax + 1)); + MemKK::realloc_kokkos(blm, "pace:blm", (lmax + 1) * (lmax + 1)); + MemKK::realloc_kokkos(cl, "pace:cl", lmax + 1); + MemKK::realloc_kokkos(dl, "pace:dl", lmax + 1); + + pre_compute_harmonics(lmax); + copy_pertype(); + copy_splines(); + copy_tilde(); +} + +/* ---------------------------------------------------------------------- + init for one type pair i,j and corresponding j,i +------------------------------------------------------------------------- */ + +template +double PairPACEExtrapolationKokkos::init_one(int i, int j) +{ + double cutone = PairPACE::init_one(i,j); + + k_scale.h_view(i,j) = k_scale.h_view(j,i) = scale[i][j]; + k_scale.template modify(); + + k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone; + k_cutsq.template modify(); + + return cutone; +} + +/* ---------------------------------------------------------------------- + set coeffs for one or more type pairs +------------------------------------------------------------------------- */ + +template +void PairPACEExtrapolationKokkos::coeff(int narg, char **arg) +{ + PairPACE::coeff(narg,arg); + + // Set up element lists + + auto h_map = Kokkos::create_mirror_view(d_map); + + for (int i = 1; i <= atom->ntypes; i++) + h_map(i) = map[i]; + + Kokkos::deep_copy(d_map,h_map); +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEExtrapolationKokkos::allocate() +{ + PairPACE::allocate(); + + int n = atom->ntypes + 1; + MemKK::realloc_kokkos(d_map, "pace:map", n); + + MemKK::realloc_kokkos(k_cutsq, "pace:cutsq", n, n); + d_cutsq = k_cutsq.template view(); + + MemKK::realloc_kokkos(k_scale, "pace:scale", n, n); + d_scale = k_scale.template view(); +} + +/* ---------------------------------------------------------------------- */ + +template +struct FindMaxNumNeighs { + typedef DeviceType device_type; + NeighListKokkos k_list; + + FindMaxNumNeighs(NeighListKokkos* nl): k_list(*nl) {} + ~FindMaxNumNeighs() {k_list.copymode = 1;} + + KOKKOS_INLINE_FUNCTION + void operator() (const int& ii, int& maxneigh) const { + const int i = k_list.d_ilist[ii]; + const int num_neighs = k_list.d_numneigh[i]; + if (maxneigh < num_neighs) maxneigh = num_neighs; + } +}; + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in) +{ + if (host_flag) { + atomKK->sync(Host,X_MASK|TYPE_MASK); + PairPACE::compute(eflag_in,vflag_in); + atomKK->modified(Host,F_MASK); + return; + } + + eflag = eflag_in; + vflag = vflag_in; + + if (neighflag == FULL) no_virial_fdotr_compute = 1; + + ev_init(eflag,vflag,0); + + // reallocate per-atom arrays if necessary + + if (eflag_atom) { + memoryKK->destroy_kokkos(k_eatom,eatom); + memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom"); + d_eatom = k_eatom.view(); + } + if (vflag_atom) { + memoryKK->destroy_kokkos(k_vatom,vatom); + memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom"); + d_vatom = k_vatom.view(); + } + + copymode = 1; + if (!force->newton_pair) + error->all(FLERR,"PairPACEExtrapolationKokkos requires 'newton on'"); + + if (recursive) + error->all(FLERR,"Must use 'product' algorithm with pair pace/kk on the GPU"); + + atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK); + x = atomKK->k_x.view(); + f = atomKK->k_f.view(); + type = atomKK->k_type.view(); + k_scale.template sync(); + k_cutsq.template sync(); + + NeighListKokkos* k_list = static_cast*>(list); + d_numneigh = k_list->d_numneigh; + d_neighbors = k_list->d_neighbors; + d_ilist = k_list->d_ilist; + inum = list->inum; + + need_dup = lmp->kokkos->need_dup(); + if (need_dup) { + dup_f = Kokkos::Experimental::create_scatter_view(f); + dup_vatom = Kokkos::Experimental::create_scatter_view(d_vatom); + } else { + ndup_f = Kokkos::Experimental::create_scatter_view(f); + ndup_vatom = Kokkos::Experimental::create_scatter_view(d_vatom); + } + + maxneigh = 0; + Kokkos::parallel_reduce("pace::find_maxneigh", inum, FindMaxNumNeighs(k_list), Kokkos::Max(maxneigh)); + + int vector_length_default = 1; + int team_size_default = 1; + if (!host_flag) + team_size_default = 32; + + chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user + chunk_offset = 0; + + grow(chunk_size, maxneigh); + + EV_FLOAT ev; + + while (chunk_offset < inum) { // chunk up loop to prevent running out of memory + + Kokkos::deep_copy(weights, 0.0); + Kokkos::deep_copy(weights_rank1, 0.0); + Kokkos::deep_copy(A, 0.0); + Kokkos::deep_copy(A_rank1, 0.0); + Kokkos::deep_copy(rhos, 0.0); + + EV_FLOAT ev_tmp; + + if (chunk_size > inum - chunk_offset) + chunk_size = inum - chunk_offset; + + //Neigh + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(chunk_size,team_size,vector_length); + int scratch_size = scratch_size_helper(team_size * maxneigh); + typename Kokkos::TeamPolicy policy_neigh(chunk_size,team_size,vector_length); + policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size)); + Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this); + } + + //ComputeRadial + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + typename Kokkos::TeamPolicy policy_radial(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + Kokkos::parallel_for("ComputeRadial",policy_radial,*this); + } + + //ComputeYlm + { + int vector_length = vector_length_default; + int team_size = 16; + check_team_size_for(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + typename Kokkos::TeamPolicy policy_ylm(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + Kokkos::parallel_for("ComputeYlm",policy_ylm,*this); + } + + //ComputeAi + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + typename Kokkos::TeamPolicy policy_ai(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + Kokkos::parallel_for("ComputeAi",policy_ai,*this); + } + + //ConjugateAi + { + typename Kokkos::RangePolicy policy_conj_ai(0,chunk_size); + Kokkos::parallel_for("ConjugateAi",policy_conj_ai,*this); + } + + //ComputeRho + { + typename Kokkos::RangePolicy policy_rho(0,chunk_size*idx_rho_max); + Kokkos::parallel_for("ComputeRho",policy_rho,*this); + } + + //ComputeFS + { + typename Kokkos::RangePolicy policy_fs(0,chunk_size); + Kokkos::parallel_for("ComputeFS",policy_fs,*this); + } + + //ComputeWeights + { + typename Kokkos::RangePolicy policy_weights(0,chunk_size*idx_rho_max); + Kokkos::parallel_for("ComputeWeights",policy_weights,*this); + } + + //ComputeDerivative + { + int vector_length = vector_length_default; + int team_size = team_size_default; + check_team_size_for(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + typename Kokkos::TeamPolicy policy_derivative(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length); + Kokkos::parallel_for("ComputeDerivative",policy_derivative,*this); + } + + //ComputeForce + { + if (evflag) { + if (neighflag == HALF) { + typename Kokkos::RangePolicy > policy_force(0,chunk_size); + Kokkos::parallel_reduce(policy_force, *this, ev_tmp); + } else if (neighflag == HALFTHREAD) { + typename Kokkos::RangePolicy > policy_force(0,chunk_size); + Kokkos::parallel_reduce("ComputeForce",policy_force, *this, ev_tmp); + } + } else { + if (neighflag == HALF) { + typename Kokkos::RangePolicy > policy_force(0,chunk_size); + Kokkos::parallel_for(policy_force, *this); + } else if (neighflag == HALFTHREAD) { + typename Kokkos::RangePolicy > policy_force(0,chunk_size); + Kokkos::parallel_for("ComputeForce",policy_force, *this); + } + } + } + ev += ev_tmp; + chunk_offset += chunk_size; + + } // end while + + if (need_dup) + Kokkos::Experimental::contribute(f, dup_f); + + if (eflag_global) eng_vdwl += ev.evdwl; + if (vflag_global) { + virial[0] += ev.v[0]; + virial[1] += ev.v[1]; + virial[2] += ev.v[2]; + virial[3] += ev.v[3]; + virial[4] += ev.v[4]; + virial[5] += ev.v[5]; + } + + if (vflag_fdotr) pair_virial_fdotr_compute(this); + + if (eflag_atom) { + k_eatom.template modify(); + k_eatom.template sync(); + } + + if (vflag_atom) { + if (need_dup) + Kokkos::Experimental::contribute(d_vatom, dup_vatom); + k_vatom.template modify(); + k_vatom.template sync(); + } + + atomKK->modified(execution_space,F_MASK); + + copymode = 0; + + // free duplicated memory + if (need_dup) { + dup_f = decltype(dup_f)(); + dup_vatom = decltype(dup_vatom)(); + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const +{ + const int ii = team.league_rank(); + const int i = d_ilist[ii + chunk_offset]; + const int itype = type[i]; + const X_FLOAT xtmp = x(i,0); + const X_FLOAT ytmp = x(i,1); + const X_FLOAT ztmp = x(i,2); + const int jnum = d_numneigh[i]; + + // get a pointer to scratch memory + // This is used to cache whether or not an atom is within the cutoff + // If it is, inside is assigned to 1, otherwise -1 + const int team_rank = team.team_rank(); + const int scratch_shift = team_rank * maxneigh; // offset into pointer for entire team + int* inside = (int*)team.team_shmem().get_shmem(team.team_size() * maxneigh * sizeof(int), 0) + scratch_shift; + + // loop over list of all neighbors within force cutoff + // distsq[] = distance sq to each + // rlist[] = distance vector to each + // nearest[] = atom indices of neighbors + + int ncount = 0; + Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,jnum), + [&] (const int jj, int& count) { + int j = d_neighbors(i,jj); + j &= NEIGHMASK; + + const int jtype = type(j); + + const F_FLOAT delx = xtmp - x(j,0); + const F_FLOAT dely = ytmp - x(j,1); + const F_FLOAT delz = ztmp - x(j,2); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + + inside[jj] = -1; + if (rsq < d_cutsq(itype,jtype)) { + inside[jj] = 1; + count++; + } + },ncount); + + d_ncount(ii) = ncount; + + Kokkos::parallel_scan(Kokkos::TeamThreadRange(team,jnum), + [&] (const int jj, int& offset, bool final) { + + if (inside[jj] < 0) return; + + if (final) { + int j = d_neighbors(i,jj); + j &= NEIGHMASK; + const F_FLOAT delx = xtmp - x(j,0); + const F_FLOAT dely = ytmp - x(j,1); + const F_FLOAT delz = ztmp - x(j,2); + const F_FLOAT rsq = delx*delx + dely*dely + delz*delz; + const F_FLOAT r = sqrt(rsq); + const F_FLOAT rinv = 1.0/r; + const int mu_j = d_map(type(j)); + d_mu(ii,offset) = mu_j; + d_rnorms(ii,offset) = r; + d_rhats(ii,offset,0) = -delx*rinv; + d_rhats(ii,offset,1) = -dely*rinv; + d_rhats(ii,offset,2) = -delz*rinv; + d_nearest(ii,offset) = j; + } + offset++; + }); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeRadial, const typename Kokkos::TeamPolicy::member_type& team) const +{ + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % + ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + const int i = d_ilist[ii + chunk_offset]; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ncount = d_ncount(ii); + if (jj >= ncount) return; + + const double r_norm = d_rnorms(ii, jj); + const int mu_i = d_map(type(i)); + const int mu_j = d_mu(ii, jj); + + evaluate_splines(ii, jj, r_norm, nradbase, nradmax, mu_i, mu_j); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeYlm, const typename Kokkos::TeamPolicy::member_type& team) const +{ + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % + ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ncount = d_ncount(ii); + if (jj >= ncount) return; + + const double xn = d_rhats(ii, jj, 0); + const double yn = d_rhats(ii, jj, 1); + const double zn = d_rhats(ii, jj, 2); + compute_ylm(ii,jj,xn,yn,zn,lmax); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeAi, const typename Kokkos::TeamPolicy::member_type& team) const +{ + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % + ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ncount = d_ncount(ii); + if (jj >= ncount) return; + + const int mu_j = d_mu(ii, jj); + + // rank = 1 + for (int n = 0; n < nradbase; n++) + Kokkos::atomic_add(&A_rank1(ii, mu_j, n), gr(ii, jj, n) * Y00); + + // rank > 1 + for (int n = 0; n < nradmax; n++) { + for (int l = 0; l <= lmax; l++) { + for (int m = 0; m <= l; m++) { + const int idx = l * (l + 1) + m; // (l, m) + Kokkos::atomic_add(&A(ii, mu_j, n, idx).re, fr(ii, jj, n, l) * ylm(ii, jj, idx).re); + Kokkos::atomic_add(&A(ii, mu_j, n, idx).im, fr(ii, jj, n, l) * ylm(ii, jj, idx).im); + } + } + } + + // hard-core repulsion + Kokkos::atomic_add(&rho_core(ii), cr(ii, jj)); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEConjugateAi, const int& ii) const +{ + //complex conjugate A's (for NEGATIVE (-m) terms) + // for rank > 1 + for (int mu_j = 0; mu_j < nelements; mu_j++) { + for (int n = 0; n < nradmax; n++) { + for (int l = 0; l <= lmax; l++) { + //fill in -m part in the outer loop using the same m <-> -m symmetry as for Ylm + for (int m = 1; m <= l; m++) { + const int idx = l * (l + 1) + m; // (l, m) + const int idxm = l * (l + 1) - m; // (l, -m) + const int factor = m % 2 == 0 ? 1 : -1; + A(ii, mu_j, n, idxm) = A(ii, mu_j, n, idx).conj() * (double)factor; + } + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeRho, const int& iter) const +{ + const int idx_rho = iter / chunk_size; + const int ii = iter % chunk_size; + + const int i = d_ilist[ii + chunk_offset]; + const int mu_i = d_map(type(i)); + + if (idx_rho >= d_idx_rho_count(mu_i)) return; + + const int ndensity = d_ndensity(mu_i); + + const int offset = d_offsets(mu_i, idx_rho); + const int rank = d_rank(mu_i, offset); + const int r = rank - 1; + + // Basis functions B with iterative product and density rho(p) calculation + if (rank == 1) { + const int mu = d_mus(mu_i, offset, 0); + const int n = d_ns(mu_i, offset, 0); + double A_cur = A_rank1(ii, mu, n - 1); + for (int p = 0; p < ndensity; ++p) { + //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1 + Kokkos::atomic_add(&rhos(ii, p), d_ctildes(mu_i, idx_rho, p) * A_cur); + } + } else { // rank > 1 + // loop over {ms} combinations in sum + + // loop over m, collect B = product of A with given ms + A_forward_prod(ii, idx_rho, 0) = complex::one(); + + // fill forward A-product triangle + for (int t = 0; t < rank; t++) { + //TODO: optimize ns[t]-1 -> ns[t] during functions construction + const int mu = d_mus(mu_i, offset, t); + const int n = d_ns(mu_i, offset, t); + const int l = d_ls(mu_i, offset, t); + const int m = d_ms_combs(mu_i, idx_rho, t); // current ms-combination (of length = rank) + const int idx = l * (l + 1) + m; // (l, m) + A_list(ii, idx_rho, t) = A(ii, mu, n - 1, idx); + A_forward_prod(ii, idx_rho, t + 1) = A_forward_prod(ii, idx_rho, t) * A_list(ii, idx_rho, t); + } + + complex A_backward_prod = complex::one(); + + // fill backward A-product triangle + for (int t = r; t >= 1; t--) { + const complex dB = A_forward_prod(ii, idx_rho, t) * A_backward_prod; // dB - product of all A's except t-th + dB_flatten(ii, idx_rho, t) = dB; + + A_backward_prod = A_backward_prod * A_list(ii, idx_rho, t); + } + dB_flatten(ii, idx_rho, 0) = A_forward_prod(ii, idx_rho, 0) * A_backward_prod; + + const complex B = A_forward_prod(ii, idx_rho, rank); + + for (int p = 0; p < ndensity; ++p) { + // real-part only multiplication + Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_ctildes(mu_i, idx_rho, p))); + } + } +} + +/* ---------------------------------------------------------------------- */ + + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeFS, const int& ii) const +{ + const int i = d_ilist[ii + chunk_offset]; + const int mu_i = d_map(type(i)); + + const double rho_cut = d_rho_core_cutoff(mu_i); + const double drho_cut = d_drho_core_cutoff(mu_i); + const int ndensity = d_ndensity(mu_i); + + double evdwl, fcut, dfcut; + evdwl = fcut = dfcut = 0.0; + + inner_cutoff(rho_core(ii), rho_cut, drho_cut, fcut, dfcut); + FS_values_and_derivatives(ii, evdwl, mu_i); + + dF_drho_core(ii) = evdwl * dfcut + 1; + for (int p = 0; p < ndensity; ++p) + dF_drho(ii, p) *= fcut; + + + // tally energy contribution + if (eflag) { + double evdwl_cut = evdwl * fcut + rho_core(ii); + // E0 shift + evdwl_cut += d_E0vals(mu_i); + e_atom(ii) = evdwl_cut; + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeWeights, const int& iter) const +{ + const int idx_rho = iter / chunk_size; + const int ii = iter % chunk_size; + + const int i = d_ilist[ii + chunk_offset]; + const int mu_i = d_map(type(i)); + + if (idx_rho >= d_idx_rho_count(mu_i)) return; + + const int ndensity = d_ndensity(mu_i); + + const int offset = d_offsets(mu_i, idx_rho); + const int rank = d_rank(mu_i, offset); + + // Weights and theta calculation + + if (rank == 1) { + const int mu = d_mus(mu_i, offset, 0); + const int n = d_ns(mu_i, offset, 0); + double theta = 0.0; + for (int p = 0; p < ndensity; ++p) { + // for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1 + theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p); + } + Kokkos::atomic_add(&weights_rank1(ii, mu, n - 1), theta); + } else { // rank > 1 + double theta = 0.0; + for (int p = 0; p < ndensity; ++p) + theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p); + + theta *= 0.5; // 0.5 factor due to possible double counting ??? + for (int t = 0; t < rank; ++t) { + const int m_t = d_ms_combs(mu_i, idx_rho, t); + const int factor = (m_t % 2 == 0 ? 1 : -1); + const complex dB = dB_flatten(ii, idx_rho, t); + const int mu_t = d_mus(mu_i, offset, t); + const int n_t = d_ns(mu_i, offset, t); + const int l_t = d_ls(mu_i, offset, t); + const int idx = l_t * (l_t + 1) + m_t; // (l, m) + const complex value = theta * dB; + Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).re), value.re); + Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).im), value.im); + // update -m_t (that could also be positive), because the basis is half_basis + const int idxm = l_t * (l_t + 1) - m_t; // (l, -m) + const complex valuem = theta * dB.conj() * (double)factor; + Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).re), valuem.re); + Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).im), valuem.im); + } + } +} + +/* ---------------------------------------------------------------------- */ +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeDerivative, const typename Kokkos::TeamPolicy::member_type& team) const +{ + // Extract the atom number + int ii = team.team_rank() + team.team_size() * (team.league_rank() % + ((chunk_size+team.team_size()-1)/team.team_size())); + if (ii >= chunk_size) return; + const int i = d_ilist[ii + chunk_offset]; + + // Extract the neighbor number + const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size()); + const int ncount = d_ncount(ii); + if (jj >= ncount) return; + + const int itype = type(i); + const double scale = d_scale(itype,itype); + + const int mu_j = d_mu(ii, jj); + double r_hat[3]; + r_hat[0] = d_rhats(ii, jj, 0); + r_hat[1] = d_rhats(ii, jj, 1); + r_hat[2] = d_rhats(ii, jj, 2); + const double r = d_rnorms(ii, jj); + const double rinv = 1.0/r; + + double f_ji[3]; + f_ji[0] = f_ji[1] = f_ji[2] = 0; + + // for rank = 1 + for (int n = 0; n < nradbase; ++n) { + if (weights_rank1(ii, mu_j, n) == 0) continue; + double &DG = dgr(ii, jj, n); + double DGR = DG * Y00; + DGR *= weights_rank1(ii, mu_j, n); + f_ji[0] += DGR * r_hat[0]; + f_ji[1] += DGR * r_hat[1]; + f_ji[2] += DGR * r_hat[2]; + } + + // for rank > 1 + for (int n = 0; n < nradmax; n++) { + for (int l = 0; l <= lmax; l++) { + const double R_over_r = fr(ii, jj, n, l) * rinv; + const double DR = dfr(ii, jj, n, l); + + // for m >= 0 + for (int m = 0; m <= l; m++) { + const int idx = l * (l + 1) + m; // (l, m) + complex w = weights(ii, mu_j, n, idx); + if (w.re == 0.0 && w.im == 0.0) continue; + // counting for -m cases if m > 0 + if (m > 0) { + w.re *= 2.0; + w.im *= 2.0; + } + + complex DY[3]; + DY[0] = dylm(ii, jj, idx, 0); + DY[1] = dylm(ii, jj, idx, 1); + DY[2] = dylm(ii, jj, idx, 2); + const complex Y_DR = ylm(ii, jj, idx) * DR; + + complex grad_phi_nlm[3]; + grad_phi_nlm[0] = Y_DR * r_hat[0] + DY[0] * R_over_r; + grad_phi_nlm[1] = Y_DR * r_hat[1] + DY[1] * R_over_r; + grad_phi_nlm[2] = Y_DR * r_hat[2] + DY[2] * R_over_r; + // real-part multiplication only + f_ji[0] += w.real_part_product(grad_phi_nlm[0]); + f_ji[1] += w.real_part_product(grad_phi_nlm[1]); + f_ji[2] += w.real_part_product(grad_phi_nlm[2]); + } + } + } + + // hard-core repulsion + const double fpair = dF_drho_core(ii) * dcr(ii,jj); + f_ij(ii, jj, 0) = scale * f_ji[0] + fpair * r_hat[0]; + f_ij(ii, jj, 1) = scale * f_ji[1] + fpair * r_hat[1]; + f_ij(ii, jj, 2) = scale * f_ji[2] + fpair * r_hat[2]; +} + +/* ---------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeForce, const int& ii, EV_FLOAT& ev) const +{ + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + const auto v_f = ScatterViewHelper::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + const auto a_f = v_f.template access::value>(); + + const int i = d_ilist[ii + chunk_offset]; + const int itype = type(i); + const double scale = d_scale(itype,itype); + + const int ncount = d_ncount(ii); + + F_FLOAT fitmp[3] = {0.0,0.0,0.0}; + for (int jj = 0; jj < ncount; jj++) { + int j = d_nearest(ii,jj); + + double r_hat[3]; + r_hat[0] = d_rhats(ii, jj, 0); + r_hat[1] = d_rhats(ii, jj, 1); + r_hat[2] = d_rhats(ii, jj, 2); + const double r = d_rnorms(ii, jj); + const double delx = -r_hat[0]*r; + const double dely = -r_hat[1]*r; + const double delz = -r_hat[2]*r; + + const double fpairx = f_ij(ii, jj, 0); + const double fpairy = f_ij(ii, jj, 1); + const double fpairz = f_ij(ii, jj, 2); + + fitmp[0] += fpairx; + fitmp[1] += fpairy; + fitmp[2] += fpairz; + a_f(j,0) -= fpairx; + a_f(j,1) -= fpairy; + a_f(j,2) -= fpairz; + + // tally per-atom virial contribution + if (EVFLAG && vflag_either) + v_tally_xyz(ev, i, j, fpairx, fpairy, fpairz, delx, dely, delz); + } + + a_f(i,0) += fitmp[0]; + a_f(i,1) += fitmp[1]; + a_f(i,2) += fitmp[2]; + + // tally energy contribution + if (EVFLAG && eflag_either) { + const double evdwl = scale*e_atom(ii); + //ev_tally_full(i, 2.0 * evdwl, 0.0, 0.0, 0.0, 0.0, 0.0); + if (eflag_global) ev.evdwl += evdwl; + if (eflag_atom) d_eatom[i] += evdwl; + } +} + +template +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeForce,const int& ii) const { + EV_FLOAT ev; + this->template operator()(TagPairPACEComputeForce(), ii, ev); +} + +/* ---------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz, + const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const +{ + // The vatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_vatom = ScatterViewHelper,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access>(); + + const E_FLOAT v0 = delx*fx; + const E_FLOAT v1 = dely*fy; + const E_FLOAT v2 = delz*fz; + const E_FLOAT v3 = delx*fy; + const E_FLOAT v4 = delx*fz; + const E_FLOAT v5 = dely*fz; + + if (vflag_global) { + ev.v[0] += v0; + ev.v[1] += v1; + ev.v[2] += v2; + ev.v[3] += v3; + ev.v[4] += v4; + ev.v[5] += v5; + } + + if (vflag_atom) { + a_vatom(i,0) += 0.5*v0; + a_vatom(i,1) += 0.5*v1; + a_vatom(i,2) += 0.5*v2; + a_vatom(i,3) += 0.5*v3; + a_vatom(i,4) += 0.5*v4; + a_vatom(i,5) += 0.5*v5; + a_vatom(j,0) += 0.5*v0; + a_vatom(j,1) += 0.5*v1; + a_vatom(j,2) += 0.5*v2; + a_vatom(j,3) += 0.5*v3; + a_vatom(j,4) += 0.5*v4; + a_vatom(j,5) += 0.5*v5; + } +} + +/* ---------------------------------------------------------------------- */ + +template +void PairPACEExtrapolationKokkos::pre_compute_harmonics(int lmax) +{ + auto h_alm = Kokkos::create_mirror_view(alm); + auto h_blm = Kokkos::create_mirror_view(blm); + auto h_cl = Kokkos::create_mirror_view(cl); + auto h_dl = Kokkos::create_mirror_view(dl); + + for (int l = 1; l <= lmax; l++) { + const double lsq = l * l; + const double ld = 2 * l; + const double l1 = (4 * lsq - 1); + const double l2 = lsq - ld + 1; + for (int m = 0; m < l - 1; m++) { + const double msq = m * m; + const double a = sqrt((double(l1)) / (double(lsq - msq))); + const double b = -sqrt((double(l2 - msq)) / (double(4 * l2 - 1))); + const int idx = l * (l + 1) + m; // (l, m) + h_alm(idx) = a; + h_blm(idx) = b; + } + } + + for (int l = 1; l <= lmax; l++) { + h_cl(l) = -sqrt(1.0 + 0.5 / (double(l))); + h_dl(l) = sqrt(double(2 * (l - 1) + 3)); + } + + Kokkos::deep_copy(alm, h_alm); + Kokkos::deep_copy(blm, h_blm); + Kokkos::deep_copy(cl, h_cl); + Kokkos::deep_copy(dl, h_dl); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::compute_barplm(int ii, int jj, double rz, int lmax) const +{ + // requires -1 <= rz <= 1 , NO CHECKING IS PERFORMED !!!!!!!!! + // prefactors include 1/sqrt(2) factor compared to reference + + // l=0, m=0 + // plm(ii, jj, 0, 0) = Y00/sq1o4pi; //= sq1o4pi; + plm(ii, jj, 0) = Y00; //= 1; + dplm(ii, jj, 0) = 0.0; + + if (lmax > 0) { + + // l=1, m=0 + plm(ii, jj, 2) = Y00 * sq3 * rz; + dplm(ii, jj, 2) = Y00 * sq3; + + // l=1, m=1 + plm(ii, jj, 3) = -sq3o2 * Y00; + dplm(ii, jj, 3) = 0.0; + + // loop l = 2, lmax + for (int l = 2; l <= lmax; l++) { + for (int m = 0; m < l - 1; m++) { + const int idx = l * (l + 1) + m; // (l, m) + const int idx1 = (l - 1) * l + m; // (l - 1, m) + const int idx2 = (l - 2) * (l - 1) + m; // (l - 2, m) + plm(ii, jj, idx) = alm(idx) * (rz * plm(ii, jj, idx1) + blm(idx) * plm(ii, jj, idx2)); + dplm(ii, jj, idx) = alm(idx) * (plm(ii, jj, idx1) + rz * dplm(ii, jj, idx1) + blm(idx) * dplm(ii, jj, idx2)); + } + const int idx = l * (l + 1) + l; // (l, l) + const int idx1 = l * (l + 1) + l - 1; // (l, l - 1) + const int idx2 = (l - 1) * l + l - 1; // (l - 1, l - 1) + const double t = dl(l) * plm(ii, jj, idx2); + plm(ii, jj, idx1) = t * rz; + dplm(ii, jj, idx1) = t; + plm(ii, jj, idx) = cl(l) * plm(ii, jj, idx2); + dplm(ii, jj, idx) = 0.0; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::compute_ylm(int ii, int jj, double rx, double ry, double rz, int lmax) const +{ + // requires rx^2 + ry^2 + rz^2 = 1 , NO CHECKING IS PERFORMED !!!!!!!!! + + complex phase; + complex phasem, mphasem1; + complex dyx, dyy, dyz; + complex rdy; + + phase.re = rx; + phase.im = ry; + + // compute barplm + compute_barplm(ii, jj, rz, lmax); + + // m = 0 + for (int l = 0; l <= lmax; l++) { + const int idx = l * (l + 1); + + ylm(ii, jj, idx).re = plm(ii, jj, idx); + ylm(ii, jj, idx).im = 0.0; + + dyz.re = dplm(ii, jj, idx); + rdy.re = dyz.re * rz; + + dylm(ii, jj, idx, 0).re = -rdy.re * rx; + dylm(ii, jj, idx, 0).im = 0.0; + dylm(ii, jj, idx, 1).re = -rdy.re * ry; + dylm(ii, jj, idx, 1).im = 0.0; + dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz; + dylm(ii, jj, idx, 2).im = 0; + } + // m = 1 + for (int l = 1; l <= lmax; l++) { + const int idx = l * (l + 1) + 1; + + ylm(ii, jj, idx) = phase * plm(ii, jj, idx); + + dyx.re = plm(ii, jj, idx); + dyx.im = 0.0; + dyy.re = 0.0; + dyy.im = plm(ii, jj, idx); + dyz.re = phase.re * dplm(ii, jj, idx); + dyz.im = phase.im * dplm(ii, jj, idx); + + rdy.re = rx * dyx.re + +rz * dyz.re; + rdy.im = ry * dyy.im + rz * dyz.im; + + dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx; + dylm(ii, jj, idx, 0).im = -rdy.im * rx; + dylm(ii, jj, idx, 1).re = -rdy.re * ry; + dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry; + dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz; + dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz; + } + + // m > 1 + phasem = phase; + for (int m = 2; m <= lmax; m++) { + + mphasem1.re = phasem.re * double(m); + mphasem1.im = phasem.im * double(m); + phasem = phasem * phase; + + for (int l = m; l <= lmax; l++) { + const int idx = l * (l + 1) + m; + + ylm(ii, jj, idx).re = phasem.re * plm(ii, jj, idx); + ylm(ii, jj, idx).im = phasem.im * plm(ii, jj, idx); + + dyx = mphasem1 * plm(ii, jj, idx); + dyy.re = -dyx.im; + dyy.im = dyx.re; + dyz = phasem * dplm(ii, jj, idx); + + rdy.re = rx * dyx.re + ry * dyy.re + rz * dyz.re; + rdy.im = rx * dyx.im + ry * dyy.im + rz * dyz.im; + + dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx; + dylm(ii, jj, idx, 0).im = dyx.im - rdy.im * rx; + dylm(ii, jj, idx, 1).re = dyy.re - rdy.re * ry; + dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry; + dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz; + dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::cutoff_func_poly(const double r, const double r_in, const double delta_in, double &fc, double &dfc) const +{ + if (r <= r_in-delta_in) { + fc = 1; + dfc = 0; + } else if (r >= r_in ) { + fc = 0; + dfc = 0; + } else { + double x = 1 - 2 * (1 + (r - r_in) / delta_in); + fc = 0.5 + 7.5 / 2. * (x / 4. - pow(x, 3) / 6. + pow(x, 5) / 20.); + dfc = -7.5 / delta_in * (0.25 - x * x / 2.0 + pow(x, 4) / 4.); + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::Fexp(const double x, const double m, double &F, double &DF) const +{ + const double w = 1.e6; + const double eps = 1e-10; + + const double lambda = pow(1.0 / w, m - 1.0); + if (abs(x) > eps) { + double g; + const double a = abs(x); + const double am = pow(a, m); + const double w3x3 = pow(w * a, 3); //// use cube + const double sign_factor = (signbit(x) ? -1 : 1); + if (w3x3 > 30.0) + g = 0.0; + else + g = exp(-w3x3); + + const double omg = 1.0 - g; + F = sign_factor * (omg * am + lambda * g * a); + const double dg = -3.0 * w * w * w * a * a * g; + DF = m * pow(a, m - 1.0) * omg - am * dg + lambda * dg * a + lambda * g; + } else { + F = lambda * x; + DF = lambda; + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::FexpShiftedScaled(const double rho, const double mexp, double &F, double &DF) const +{ + const double eps = 1e-10; + + if (abs(mexp - 1.0) < eps) { + F = rho; + DF = 1; + } else { + const double a = abs(rho); + const double exprho = exp(-a); + const double nx = 1. / mexp; + const double xoff = pow(nx, (nx / (1.0 - nx))) * exprho; + const double yoff = pow(nx, (1 / (1.0 - nx))) * exprho; + const double sign_factor = (signbit(rho) ? -1 : 1); + F = sign_factor * (pow(xoff + a, mexp) - yoff); + DF = yoff + mexp * (-xoff + 1.0) * pow(xoff + a, mexp - 1.); + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::inner_cutoff(const double rho_core, const double rho_cut, const double drho_cut, + double &fcut, double &dfcut) const +{ + double rho_low = rho_cut - drho_cut; + if (rho_core >= rho_cut) { + fcut = 0; + dfcut = 0; + } else if (rho_core <= rho_low) { + fcut = 1; + dfcut = 0; + } else { + cutoff_func_poly(rho_core, rho_cut, drho_cut, fcut, dfcut); + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::FS_values_and_derivatives(const int ii, double &evdwl, const int mu_i) const +{ + double F, DF = 0; + int npoti = d_npoti(mu_i); + int ndensity = d_ndensity(mu_i); + for (int p = 0; p < ndensity; p++) { + const double wpre = d_wpre(mu_i, p); + const double mexp = d_mexp(mu_i, p); + + if (npoti == FS) + Fexp(rhos(ii, p), mexp, F, DF); + else if (npoti == FS_SHIFTEDSCALED) + FexpShiftedScaled(rhos(ii, p), mexp, F, DF); + + evdwl += F * wpre; // * weight (wpre) + dF_drho(ii, p) = DF * wpre; // * weight (wpre) + } +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::evaluate_splines(const int ii, const int jj, double r, + int /*nradbase_c*/, int /*nradial_c*/, + int mu_i, int mu_j) const +{ + auto &spline_gk = k_splines_gk.template view()(mu_i, mu_j); + auto &spline_rnl = k_splines_rnl.template view()(mu_i, mu_j); + auto &spline_hc = k_splines_hc.template view()(mu_i, mu_j); + + spline_gk.calcSplines(ii, jj, r, gr, dgr); + + spline_rnl.calcSplines(ii, jj, r, d_values, d_derivatives); + for (int kk = 0; kk < (int)fr.extent(2); kk++) { + for (int ll = 0; ll < (int)fr.extent(3); ll++) { + const int flatten = kk*fr.extent(3) + ll; + fr(ii, jj, kk, ll) = d_values(ii, jj, flatten); + dfr(ii, jj, kk, ll) = d_derivatives(ii, jj, flatten); + } + } + + spline_hc.calcSplines(ii, jj, r, d_values, d_derivatives); + cr(ii, jj) = d_values(ii, jj, 0); + dcr(ii, jj) = d_derivatives(ii, jj, 0); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::SplineInterpolatorKokkos::calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const +{ + double wl, wl2, wl3, w2l1, w3l2; + double c[4]; + double x = r * rscalelookup; + int nl = static_cast(floor(x)); + + if (nl <= 0) + Kokkos::abort("Encountered very small distance. Stopping."); + + if (nl < nlut) { + wl = x - double(nl); + wl2 = wl * wl; + wl3 = wl2 * wl; + w2l1 = 2.0 * wl; + w3l2 = 3.0 * wl2; + for (int func_id = 0; func_id < num_of_functions; func_id++) { + for (int idx = 0; idx < 4; idx++) + c[idx] = lookupTable(nl, func_id, idx); + d_values(ii, jj, func_id) = c[0] + c[1] * wl + c[2] * wl2 + c[3] * wl3; + d_derivatives(ii, jj, func_id) = (c[1] + c[2] * w2l1 + c[3] * w3l2) * rscalelookup; + } + } else { // fill with zeroes + for (int func_id = 0; func_id < num_of_functions; func_id++) { + d_values(ii, jj, func_id) = 0.0; + d_derivatives(ii, jj, func_id) = 0.0; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +template +void PairPACEExtrapolationKokkos::check_team_size_for(int inum, int &team_size, int vector_length) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +/* ---------------------------------------------------------------------- */ + +template +template +void PairPACEExtrapolationKokkos::check_team_size_reduce(int inum, int &team_size, int vector_length) { + int team_size_max; + + team_size_max = Kokkos::TeamPolicy(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag()); + + if (team_size*vector_length > team_size_max) + team_size = team_size_max/vector_length; +} + +template +template +int PairPACEExtrapolationKokkos::scratch_size_helper(int values_per_team) { + typedef Kokkos::View > ScratchViewType; + + return ScratchViewType::shmem_size(values_per_team); +} + +/* ---------------------------------------------------------------------- + memory usage of arrays +------------------------------------------------------------------------- */ + +template +double PairPACEExtrapolationKokkos::memory_usage() +{ + double bytes = 0; + + bytes += MemKK::memory_usage(A); + bytes += MemKK::memory_usage(A_rank1); + bytes += MemKK::memory_usage(A_list); + bytes += MemKK::memory_usage(A_forward_prod); + bytes += MemKK::memory_usage(e_atom); + bytes += MemKK::memory_usage(rhos); + bytes += MemKK::memory_usage(dF_drho); + bytes += MemKK::memory_usage(weights); + bytes += MemKK::memory_usage(weights_rank1); + bytes += MemKK::memory_usage(rho_core); + bytes += MemKK::memory_usage(dF_drho_core); + bytes += MemKK::memory_usage(dB_flatten); + bytes += MemKK::memory_usage(fr); + bytes += MemKK::memory_usage(dfr); + bytes += MemKK::memory_usage(gr); + bytes += MemKK::memory_usage(dgr); + bytes += MemKK::memory_usage(d_values); + bytes += MemKK::memory_usage(d_derivatives); + bytes += MemKK::memory_usage(cr); + bytes += MemKK::memory_usage(dcr); + bytes += MemKK::memory_usage(plm); + bytes += MemKK::memory_usage(dplm); + bytes += MemKK::memory_usage(ylm); + bytes += MemKK::memory_usage(dylm); + bytes += MemKK::memory_usage(d_ncount); + bytes += MemKK::memory_usage(d_mu); + bytes += MemKK::memory_usage(d_rhats); + bytes += MemKK::memory_usage(d_rnorms); + bytes += MemKK::memory_usage(d_nearest); + bytes += MemKK::memory_usage(f_ij); + bytes += MemKK::memory_usage(d_rho_core_cutoff); + bytes += MemKK::memory_usage(d_drho_core_cutoff); + bytes += MemKK::memory_usage(d_E0vals); + bytes += MemKK::memory_usage(d_ndensity); + bytes += MemKK::memory_usage(d_npoti); + bytes += MemKK::memory_usage(d_wpre); + bytes += MemKK::memory_usage(d_mexp); + bytes += MemKK::memory_usage(d_idx_rho_count); + bytes += MemKK::memory_usage(d_rank); + bytes += MemKK::memory_usage(d_num_ms_combs); + bytes += MemKK::memory_usage(d_offsets); + bytes += MemKK::memory_usage(d_mus); + bytes += MemKK::memory_usage(d_ns); + bytes += MemKK::memory_usage(d_ls); + bytes += MemKK::memory_usage(d_ms_combs); + bytes += MemKK::memory_usage(d_ctildes); + bytes += MemKK::memory_usage(alm); + bytes += MemKK::memory_usage(blm); + bytes += MemKK::memory_usage(cl); + bytes += MemKK::memory_usage(dl); + + if (k_splines_gk.h_view.data()) { + for (int i = 0; i < nelements; i++) { + for (int j = 0; j < nelements; j++) { + bytes += k_splines_gk.h_view(i, j).memory_usage(); + bytes += k_splines_rnl.h_view(i, j).memory_usage(); + bytes += k_splines_hc.h_view(i, j).memory_usage(); + } + } + } + + return bytes; +} + +/* ---------------------------------------------------------------------- */ + +namespace LAMMPS_NS { +template class PairPACEExtrapolationKokkos; +#ifdef LMP_KOKKOS_GPU +template class PairPACEExtrapolationKokkos; +#endif +} diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h new file mode 100644 index 0000000000..482d068725 --- /dev/null +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h @@ -0,0 +1,334 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + LAMMPS development team: developers@lammps.org + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS +// clang-format off +PairStyle(pace/extrapolation/kk,PairPACEExtrapolationKokkos); +PairStyle(pace/extrapolation/kk/device,PairPACEExtrapolationKokkos); +PairStyle(pace/extrapolation/kk/host,PairPACEExtrapolationKokkos); +// clang-format on +#else + +// clang-format off +#ifndef LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H +#define LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H + +#include "pair_pace.h" +#include "ace-evaluator/ace_radial.h" +#include "kokkos_type.h" +#include "pair_kokkos.h" + +namespace LAMMPS_NS { + +template +class PairPACEExtrapolationKokkos : public PairPACE { + public: + struct TagPairPACEComputeNeigh{}; + struct TagPairPACEComputeRadial{}; + struct TagPairPACEComputeYlm{}; + struct TagPairPACEComputeAi{}; + struct TagPairPACEConjugateAi{}; + struct TagPairPACEComputeRho{}; + struct TagPairPACEComputeFS{}; + struct TagPairPACEComputeWeights{}; + struct TagPairPACEComputeDerivative{}; + + template + struct TagPairPACEComputeForce{}; + + typedef DeviceType device_type; + typedef ArrayTypes AT; + typedef EV_FLOAT value_type; + using complex = SNAComplex; + + PairPACEExtrapolationKokkos(class LAMMPS *); + ~PairPACEExtrapolationKokkos() override; + + void compute(int, int) override; + void coeff(int, char **) override; + void init_style() override; + double init_one(int, int) override; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeNeigh,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeRadial,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeYlm,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeAi,const typename Kokkos::TeamPolicy::member_type& team) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEConjugateAi,const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeRho,const int& iter) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeFS,const int& ii) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeWeights,const int& iter) const; + + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeDerivative,const typename Kokkos::TeamPolicy::member_type& team) const; + + template + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeForce,const int& ii) const; + + template + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeForce,const int& ii, EV_FLOAT&) const; + + protected: + int inum, maxneigh, chunk_size, chunk_offset, idx_rho_max; + int host_flag; + + int eflag, vflag; + + int neighflag, max_ndensity; + int nelements, lmax, nradmax, nradbase; + + typename AT::t_neighbors_2d d_neighbors; + typename AT::t_int_1d_randomread d_ilist; + typename AT::t_int_1d_randomread d_numneigh; + + DAT::tdual_efloat_1d k_eatom; + DAT::tdual_virial_array k_vatom; + typename AT::t_efloat_1d d_eatom; + typename AT::t_virial_array d_vatom; + + typename AT::t_x_array_randomread x; + typename AT::t_f_array f; + typename AT::t_int_1d_randomread type; + + typedef Kokkos::DualView tdual_fparams; + tdual_fparams k_cutsq, k_scale; + typedef Kokkos::View t_fparams; + t_fparams d_cutsq, d_scale; + + typename AT::t_int_1d d_map; + + int need_dup; + + using KKDeviceType = typename KKDevice::value; + + template + using DupScatterView = KKScatterView; + + template + using NonDupScatterView = KKScatterView; + + DupScatterView dup_f; + DupScatterView dup_vatom; + + NonDupScatterView ndup_f; + NonDupScatterView ndup_vatom; + + friend void pair_virial_fdotr_compute(PairPACEExtrapolationKokkos*); + + void grow(int, int); + void copy_pertype(); + void copy_splines(); + void copy_tilde(); + void allocate() override; + void precompute_harmonics(); + double memory_usage() override; + + template + KOKKOS_INLINE_FUNCTION + void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j, + const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz, + const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const; + + KOKKOS_INLINE_FUNCTION + void compute_barplm(int, int, double, int) const; + + KOKKOS_INLINE_FUNCTION + void compute_ylm(int, int, double, double, double, int) const; + + KOKKOS_INLINE_FUNCTION + void cutoff_func_poly(const double, const double, const double, double &, double &) const; + + KOKKOS_INLINE_FUNCTION + void Fexp(const double, const double, double &, double &) const; + + KOKKOS_INLINE_FUNCTION + void FexpShiftedScaled(const double, const double, double &, double &) const; + + KOKKOS_INLINE_FUNCTION + void inner_cutoff(const double, const double, const double, double &, double &) const; + + KOKKOS_INLINE_FUNCTION + void FS_values_and_derivatives(const int, double&, const int) const; + + KOKKOS_INLINE_FUNCTION + void evaluate_splines(const int, const int, double, int, int, int, int) const; + + template + void check_team_size_for(int, int&, int); + + template + void check_team_size_reduce(int, int&, int); + + // Utility routine which wraps computing per-team scratch size requirements for + // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj + template + int scratch_size_helper(int values_per_team); + + typedef Kokkos::View t_ace_1i; + typedef Kokkos::View t_ace_2i; + typedef Kokkos::View t_ace_3i; + typedef Kokkos::View t_ace_4i; + typedef Kokkos::View t_ace_1d; + typedef Kokkos::View t_ace_2d; + typedef Kokkos::View t_ace_2d3; + typedef Kokkos::View t_ace_3d; + typedef Kokkos::View t_ace_3d3; + typedef Kokkos::View t_ace_3d4; + typedef Kokkos::View t_ace_4d; + typedef Kokkos::View t_ace_1c; + typedef Kokkos::View t_ace_2c; + typedef Kokkos::View t_ace_3c; + typedef Kokkos::View t_ace_3c3; + typedef Kokkos::View t_ace_4c; + typedef Kokkos::View t_ace_4c3; + + t_ace_3d A_rank1; + t_ace_4c A; + + t_ace_3c A_list; + t_ace_3c A_forward_prod; + + t_ace_3d weights_rank1; + t_ace_4c weights; + + t_ace_1d e_atom; + t_ace_2d rhos; + t_ace_2d dF_drho; + + // hard-core repulsion + t_ace_1d rho_core; + t_ace_3c dB_flatten; + t_ace_2d cr; + t_ace_2d dcr; + t_ace_1d dF_drho_core; + + // radial functions + t_ace_4d fr; + t_ace_4d dfr; + t_ace_3d gr; + t_ace_3d dgr; + t_ace_3d d_values; + t_ace_3d d_derivatives; + + // Spherical Harmonics + + void pre_compute_harmonics(int); + + KOKKOS_INLINE_FUNCTION + void compute_barplm(double rz, int lmaxi); + + KOKKOS_INLINE_FUNCTION + void compute_ylm(double rx, double ry, double rz, int lmaxi); + + t_ace_1d alm; + t_ace_1d blm; + t_ace_1d cl; + t_ace_1d dl; + + t_ace_3d plm; + t_ace_3d dplm; + + t_ace_3c ylm; + t_ace_4c3 dylm; + + // short neigh list + t_ace_1i d_ncount; + t_ace_2d d_mu; + t_ace_2d d_rnorms; + t_ace_3d3 d_rhats; + t_ace_2i d_nearest; + + // per-type + t_ace_1i d_ndensity; + t_ace_1i d_npoti; + t_ace_1d d_rho_core_cutoff; + t_ace_1d d_drho_core_cutoff; + t_ace_1d d_E0vals; + t_ace_2d d_wpre; + t_ace_2d d_mexp; + + // tilde + t_ace_1i d_idx_rho_count; + t_ace_2i d_rank; + t_ace_2i d_num_ms_combs; + t_ace_2i d_offsets; + t_ace_3i d_mus; + t_ace_3i d_ns; + t_ace_3i d_ls; + t_ace_3i d_ms_combs; + t_ace_3d d_ctildes; + + t_ace_3d3 f_ij; + + public: + struct SplineInterpolatorKokkos { + int ntot, nlut, num_of_functions; + double cutoff, deltaSplineBins, invrscalelookup, rscalelookup; + + t_ace_3d4 lookupTable; + + void operator=(const SplineInterpolator &spline) { + cutoff = spline.cutoff; + deltaSplineBins = spline.deltaSplineBins; + ntot = spline.ntot; + nlut = spline.nlut; + invrscalelookup = spline.invrscalelookup; + rscalelookup = spline.rscalelookup; + num_of_functions = spline.num_of_functions; + + lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions); + auto h_lookupTable = Kokkos::create_mirror_view(lookupTable); + for (int i = 0; i < ntot+1; i++) + for (int j = 0; j < num_of_functions; j++) + for (int k = 0; k < 4; k++) + h_lookupTable(i, j, k) = spline.lookupTable(i, j, k); + Kokkos::deep_copy(lookupTable, h_lookupTable); + } + + void deallocate() { + lookupTable = t_ace_3d4(); + } + + double memory_usage() { + return lookupTable.span() * sizeof(typename decltype(lookupTable)::value_type); + } + + KOKKOS_INLINE_FUNCTION + void calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const; + }; + + Kokkos::DualView k_splines_gk; + Kokkos::DualView k_splines_rnl; + Kokkos::DualView k_splines_hc; + +}; +} // namespace LAMMPS_NS + +#endif +#endif From 1b92569187093deb1f620d33363e9000d00c4a79 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Mon, 2 Jan 2023 18:29:12 +0100 Subject: [PATCH 02/12] WIP: pair_pace_extrapolation_kokkos.h/cpp: - rename idx_rho_max -> idx_ms_combs_max, d_idx_rho_count ->d_idx_ms_combs_count, d_offsets->d_func_inds - remove d_ctildes, add d_gen_cgs and d_coeffs - use ACEBBasisFunction - update TagPairPACEComputeRho and TagPairPACEComputeWeights - pair_pace_extrapolation.h/cpp: add chunksize option --- src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 249 ++++++++++-------- src/KOKKOS/pair_pace_extrapolation_kokkos.h | 14 +- src/ML-PACE/pair_pace_extrapolation.cpp | 22 +- src/ML-PACE/pair_pace_extrapolation.h | 4 +- 4 files changed, 164 insertions(+), 125 deletions(-) diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp index b4f4932db4..006cf5e609 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp @@ -34,19 +34,30 @@ #include "ace-evaluator/ace_recursive.h" #include "ace-evaluator/ace_version.h" #include "ace-evaluator/ace_radial.h" + +#include "ace/ace_b_basis.h" +#include "ace/ace_b_evaluator.h" + #include namespace LAMMPS_NS { -struct ACEImpl { - ACEImpl() : basis_set(nullptr), ace(nullptr) {} - ~ACEImpl() - { - delete basis_set; - delete ace; - } - ACECTildeBasisSet *basis_set; - ACERecursiveEvaluator *ace; -}; + struct ACEALImpl { + ACEALImpl() : basis_set(nullptr), ace(nullptr), ctilde_basis_set(nullptr), rec_ace(nullptr) {} + + ~ACEALImpl() + { + delete basis_set; + delete ace; + + delete ctilde_basis_set; + delete rec_ace; + } + + ACEBBasisSet *basis_set; + ACEBEvaluator *ace; + ACECTildeBasisSet *ctilde_basis_set; + ACERecursiveEvaluator *rec_ace; + }; } // namespace LAMMPS_NS using namespace LAMMPS_NS; @@ -57,7 +68,7 @@ enum{FS,FS_SHIFTEDSCALED}; /* ---------------------------------------------------------------------- */ template -PairPACEExtrapolationKokkos::PairPACEExtrapolationKokkos(LAMMPS *lmp) : PairPACE(lmp) +PairPACEExtrapolationKokkos::PairPACEExtrapolationKokkos(LAMMPS *lmp) : PairPACEExtrapolation(lmp) { respa_enable = 0; @@ -107,9 +118,9 @@ void PairPACEExtrapolationKokkos::grow(int natom, int maxneigh) MemKK::realloc_kokkos(A, "pace:A", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1)); MemKK::realloc_kokkos(A_rank1, "pace:A_rank1", natom, nelements, nradbase); - MemKK::realloc_kokkos(A_list, "pace:A_list", natom, idx_rho_max, basis_set->rankmax); + MemKK::realloc_kokkos(A_list, "pace:A_list", natom, idx_ms_combs_max, basis_set->rankmax); //size is +1 of max to avoid out-of-boundary array access in double-triangular scheme - MemKK::realloc_kokkos(A_forward_prod, "pace:A_forward_prod", natom, idx_rho_max, basis_set->rankmax + 1); + MemKK::realloc_kokkos(A_forward_prod, "pace:A_forward_prod", natom, idx_ms_combs_max, basis_set->rankmax + 1); MemKK::realloc_kokkos(e_atom, "pace:e_atom", natom); MemKK::realloc_kokkos(rhos, "pace:rhos", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion @@ -121,7 +132,7 @@ void PairPACEExtrapolationKokkos::grow(int natom, int maxneigh) // hard-core repulsion MemKK::realloc_kokkos(rho_core, "pace:rho_core", natom); MemKK::realloc_kokkos(dF_drho_core, "pace:dF_drho_core", natom); - MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_rho_max, basis_set->rankmax); + MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_ms_combs_max, basis_set->rankmax); } if (((int)ylm.extent(0) < natom) || ((int)ylm.extent(1) < maxneigh)) { @@ -263,120 +274,134 @@ void PairPACEExtrapolationKokkos::copy_tilde() // flatten loops, get per-element count and max - idx_rho_max = 0; + idx_ms_combs_max = 0; int total_basis_size_max = 0; - MemKK::realloc_kokkos(d_idx_rho_count, "pace:idx_rho_count", nelements); - auto h_idx_rho_count = Kokkos::create_mirror_view(d_idx_rho_count); + MemKK::realloc_kokkos(d_idx_ms_combs_count, "pace:idx_ms_combs_count", nelements); + auto h_idx_ms_combs_count = Kokkos::create_mirror_view(d_idx_ms_combs_count); - for (int n = 0; n < nelements; n++) { - int idx_rho = 0; - const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n]; - const int total_basis_size = basis_set->total_basis_size[n]; + for (int mu = 0; mu < nelements; mu++) { + int idx_ms_combs = 0; + const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[mu]; + const int total_basis_size = basis_set->total_basis_size[mu]; - ACECTildeBasisFunction *basis = basis_set->basis[n]; + ACEBBasisFunction *basis = basis_set->basis[mu]; // rank=1 for (int func_rank1_ind = 0; func_rank1_ind < total_basis_size_rank1; ++func_rank1_ind) - idx_rho++; + idx_ms_combs++; // rank > 1 for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) { - ACECTildeBasisFunction *func = &basis[func_ind]; + ACEBBasisFunction *func = &basis[func_ind]; // loop over {ms} combinations in sum for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) - idx_rho++; + idx_ms_combs++; } - h_idx_rho_count(n) = idx_rho; - idx_rho_max = MAX(idx_rho_max, idx_rho); + h_idx_ms_combs_count(mu) = idx_ms_combs; + idx_ms_combs_max = MAX(idx_ms_combs_max, idx_ms_combs); total_basis_size_max = MAX(total_basis_size_max, total_basis_size_rank1 + total_basis_size); } - Kokkos::deep_copy(d_idx_rho_count, h_idx_rho_count); + Kokkos::deep_copy(d_idx_ms_combs_count, h_idx_ms_combs_count); MemKK::realloc_kokkos(d_rank, "pace:rank", nelements, total_basis_size_max); MemKK::realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_basis_size_max); - MemKK::realloc_kokkos(d_offsets, "pace:offsets", nelements, idx_rho_max); + MemKK::realloc_kokkos(d_func_inds, "pace:func_inds", nelements, idx_ms_combs_max); MemKK::realloc_kokkos(d_mus, "pace:mus", nelements, total_basis_size_max, basis_set->rankmax); MemKK::realloc_kokkos(d_ns, "pace:ns", nelements, total_basis_size_max, basis_set->rankmax); MemKK::realloc_kokkos(d_ls, "pace:ls", nelements, total_basis_size_max, basis_set->rankmax); - MemKK::realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_rho_max, basis_set->rankmax); - MemKK::realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_rho_max, basis_set->ndensitymax); + MemKK::realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_ms_combs_max, basis_set->rankmax); + //MemKK::realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_ms_combs_max, basis_set->ndensitymax); + MemKK::realloc_kokkos(d_gen_cgs, "pace:gen_cgs", nelements, idx_ms_combs_max); + MemKK::realloc_kokkos(d_coeffs, "pace:coeffs", nelements, total_basis_size_max, basis_set->ndensitymax); auto h_rank = Kokkos::create_mirror_view(d_rank); auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs); - auto h_offsets = Kokkos::create_mirror_view(d_offsets); + auto h_func_inds = Kokkos::create_mirror_view(d_func_inds); auto h_mus = Kokkos::create_mirror_view(d_mus); auto h_ns = Kokkos::create_mirror_view(d_ns); auto h_ls = Kokkos::create_mirror_view(d_ls); auto h_ms_combs = Kokkos::create_mirror_view(d_ms_combs); - auto h_ctildes = Kokkos::create_mirror_view(d_ctildes); +// auto h_ctildes = Kokkos::create_mirror_view(d_ctildes); + auto h_gen_cgs = Kokkos::create_mirror_view(d_gen_cgs); + auto h_coeffs = Kokkos::create_mirror_view(d_coeffs); // copy values on host - for (int n = 0; n < nelements; n++) { - const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n]; - const int total_basis_size = basis_set->total_basis_size[n]; + for (int mu = 0; mu < nelements; mu++) { + const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[mu]; + const int total_basis_size = basis_set->total_basis_size[mu]; - ACECTildeBasisFunction *basis_rank1 = basis_set->basis_rank1[n]; - ACECTildeBasisFunction *basis = basis_set->basis[n]; + ACEBBasisFunction *basis_rank1 = basis_set->basis_rank1[mu]; + ACEBBasisFunction *basis = basis_set->basis[mu]; - const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity; + const int ndensity = basis_set->map_embedding_specifications.at(mu).ndensity; - int idx_rho = 0; + int idx_ms_comb = 0; // rank=1 - for (int offset = 0; offset < total_basis_size_rank1; ++offset) { - ACECTildeBasisFunction *func = &basis_rank1[offset]; - h_rank(n, offset) = 1; - h_mus(n, offset, 0) = func->mus[0]; - h_ns(n, offset, 0) = func->ns[0]; - for (int p = 0; p < ndensity; p++) - h_ctildes(n, idx_rho, p) = func->ctildes[p]; - h_offsets(n, idx_rho) = offset; - idx_rho++; + for (int func_ind = 0; func_ind < total_basis_size_rank1; ++func_ind) { + ACEBBasisFunction *func = &basis_rank1[func_ind]; + h_rank(mu, func_ind) = 1; + h_mus(mu, func_ind, 0) = func->mus[0]; + h_ns(mu, func_ind, 0) = func->ns[0]; + + for (int p = 0; p < ndensity; ++p) + h_coeffs(mu, func_ind, p) = func->coeff[p]; + + h_gen_cgs(mu, idx_ms_comb) = func->gen_cgs[0]; + + h_func_inds(mu, idx_ms_comb) = func_ind; + idx_ms_comb++; } // rank > 1 for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) { - ACECTildeBasisFunction *func = &basis[func_ind]; + ACEBBasisFunction *func = &basis[func_ind]; // TODO: check if func->ctildes are zero, then skip - const int offset = total_basis_size_rank1 + func_ind; + const int func_ind_through = total_basis_size_rank1 + func_ind; - const int rank = h_rank(n, offset) = func->rank; - h_num_ms_combs(n, offset) = func->num_ms_combs; + const int rank = h_rank(mu, func_ind_through) = func->rank; + h_num_ms_combs(mu, func_ind_through) = func->num_ms_combs; for (int t = 0; t < rank; t++) { - h_mus(n, offset, t) = func->mus[t]; - h_ns(n, offset, t) = func->ns[t]; - h_ls(n, offset, t) = func->ls[t]; + h_mus(mu, func_ind_through, t) = func->mus[t]; + h_ns(mu, func_ind_through, t) = func->ns[t]; + h_ls(mu, func_ind_through, t) = func->ls[t]; } + for (int p = 0; p < ndensity; ++p) + h_coeffs(mu, func_ind_through, p) = func->coeff[p]; + + // loop over {ms} combinations in sum for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) { auto ms = &func->ms_combs[ms_ind * rank]; // current ms-combination (of length = rank) for (int t = 0; t < rank; t++) - h_ms_combs(n, idx_rho, t) = ms[t]; + h_ms_combs(mu, idx_ms_comb, t) = ms[t]; - for (int p = 0; p < ndensity; ++p) { - // real-part only multiplication - h_ctildes(n, idx_rho, p) = func->ctildes[ms_ind * ndensity + p]; - } - h_offsets(n, idx_rho) = offset; - idx_rho++; + + h_gen_cgs(mu, idx_ms_comb) = func->gen_cgs[ms_ind]; + + + h_func_inds(mu, idx_ms_comb) = func_ind_through; + idx_ms_comb++; } } } Kokkos::deep_copy(d_rank, h_rank); Kokkos::deep_copy(d_num_ms_combs, h_num_ms_combs); - Kokkos::deep_copy(d_offsets, h_offsets); + Kokkos::deep_copy(d_func_inds, h_func_inds); Kokkos::deep_copy(d_mus, h_mus); Kokkos::deep_copy(d_ns, h_ns); Kokkos::deep_copy(d_ls, h_ls); Kokkos::deep_copy(d_ms_combs, h_ms_combs); - Kokkos::deep_copy(d_ctildes, h_ctildes); +// Kokkos::deep_copy(d_ctildes, h_ctildes); + Kokkos::deep_copy(d_gen_cgs, h_gen_cgs); + Kokkos::deep_copy(d_coeffs, h_coeffs); } /* ---------------------------------------------------------------------- @@ -391,7 +416,7 @@ void PairPACEExtrapolationKokkos::init_style() error->all(FLERR,"Pair style pace/kk can currently only run on a single " "CPU thread"); - PairPACE::init_style(); + PairPACEExtrapolation::init_style(); return; } @@ -436,7 +461,7 @@ void PairPACEExtrapolationKokkos::init_style() template double PairPACEExtrapolationKokkos::init_one(int i, int j) { - double cutone = PairPACE::init_one(i,j); + double cutone = PairPACEExtrapolation::init_one(i,j); k_scale.h_view(i,j) = k_scale.h_view(j,i) = scale[i][j]; k_scale.template modify(); @@ -454,7 +479,7 @@ double PairPACEExtrapolationKokkos::init_one(int i, int j) template void PairPACEExtrapolationKokkos::coeff(int narg, char **arg) { - PairPACE::coeff(narg,arg); + PairPACEExtrapolation::coeff(narg,arg); // Set up element lists @@ -471,7 +496,7 @@ void PairPACEExtrapolationKokkos::coeff(int narg, char **arg) template void PairPACEExtrapolationKokkos::allocate() { - PairPACE::allocate(); + PairPACEExtrapolation::allocate(); int n = atom->ntypes + 1; MemKK::realloc_kokkos(d_map, "pace:map", n); @@ -508,11 +533,10 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in { if (host_flag) { atomKK->sync(Host,X_MASK|TYPE_MASK); - PairPACE::compute(eflag_in,vflag_in); + PairPACEExtrapolation::compute(eflag_in,vflag_in); atomKK->modified(Host,F_MASK); return; } - eflag = eflag_in; vflag = vflag_in; @@ -521,7 +545,6 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in ev_init(eflag,vflag,0); // reallocate per-atom arrays if necessary - if (eflag_atom) { memoryKK->destroy_kokkos(k_eatom,eatom); memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom"); @@ -532,14 +555,10 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom"); d_vatom = k_vatom.view(); } - copymode = 1; if (!force->newton_pair) error->all(FLERR,"PairPACEExtrapolationKokkos requires 'newton on'"); - if (recursive) - error->all(FLERR,"Must use 'product' algorithm with pair pace/kk on the GPU"); - atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK); x = atomKK->k_x.view(); f = atomKK->k_f.view(); @@ -636,7 +655,7 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in //ComputeRho { - typename Kokkos::RangePolicy policy_rho(0,chunk_size*idx_rho_max); + typename Kokkos::RangePolicy policy_rho(0, chunk_size * idx_ms_combs_max); Kokkos::parallel_for("ComputeRho",policy_rho,*this); } @@ -648,7 +667,7 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in //ComputeWeights { - typename Kokkos::RangePolicy policy_weights(0,chunk_size*idx_rho_max); + typename Kokkos::RangePolicy policy_weights(0, chunk_size * idx_ms_combs_max); Kokkos::parallel_for("ComputeWeights",policy_weights,*this); } @@ -910,63 +929,63 @@ template KOKKOS_INLINE_FUNCTION void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeRho, const int& iter) const { - const int idx_rho = iter / chunk_size; + const int idx_ms_comb = iter / chunk_size; const int ii = iter % chunk_size; const int i = d_ilist[ii + chunk_offset]; const int mu_i = d_map(type(i)); - if (idx_rho >= d_idx_rho_count(mu_i)) return; + if (idx_ms_comb >= d_idx_ms_combs_count(mu_i)) return; const int ndensity = d_ndensity(mu_i); - const int offset = d_offsets(mu_i, idx_rho); - const int rank = d_rank(mu_i, offset); + const int func_ind = d_func_inds(mu_i, idx_ms_comb); + const int rank = d_rank(mu_i, func_ind); const int r = rank - 1; // Basis functions B with iterative product and density rho(p) calculation if (rank == 1) { - const int mu = d_mus(mu_i, offset, 0); - const int n = d_ns(mu_i, offset, 0); + const int mu = d_mus(mu_i, func_ind, 0); + const int n = d_ns(mu_i, func_ind, 0); double A_cur = A_rank1(ii, mu, n - 1); for (int p = 0; p < ndensity; ++p) { //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1 - Kokkos::atomic_add(&rhos(ii, p), d_ctildes(mu_i, idx_rho, p) * A_cur); + Kokkos::atomic_add(&rhos(ii, p), d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb) * A_cur); } } else { // rank > 1 // loop over {ms} combinations in sum // loop over m, collect B = product of A with given ms - A_forward_prod(ii, idx_rho, 0) = complex::one(); + A_forward_prod(ii, idx_ms_comb, 0) = complex::one(); // fill forward A-product triangle for (int t = 0; t < rank; t++) { //TODO: optimize ns[t]-1 -> ns[t] during functions construction - const int mu = d_mus(mu_i, offset, t); - const int n = d_ns(mu_i, offset, t); - const int l = d_ls(mu_i, offset, t); - const int m = d_ms_combs(mu_i, idx_rho, t); // current ms-combination (of length = rank) + const int mu = d_mus(mu_i, func_ind, t); + const int n = d_ns(mu_i, func_ind, t); + const int l = d_ls(mu_i, func_ind, t); + const int m = d_ms_combs(mu_i, idx_ms_comb, t); // current ms-combination (of length = rank) const int idx = l * (l + 1) + m; // (l, m) - A_list(ii, idx_rho, t) = A(ii, mu, n - 1, idx); - A_forward_prod(ii, idx_rho, t + 1) = A_forward_prod(ii, idx_rho, t) * A_list(ii, idx_rho, t); + A_list(ii, idx_ms_comb, t) = A(ii, mu, n - 1, idx); + A_forward_prod(ii, idx_ms_comb, t + 1) = A_forward_prod(ii, idx_ms_comb, t) * A_list(ii, idx_ms_comb, t); } complex A_backward_prod = complex::one(); // fill backward A-product triangle for (int t = r; t >= 1; t--) { - const complex dB = A_forward_prod(ii, idx_rho, t) * A_backward_prod; // dB - product of all A's except t-th - dB_flatten(ii, idx_rho, t) = dB; + const complex dB = A_forward_prod(ii, idx_ms_comb, t) * A_backward_prod; // dB - product of all A's except t-th + dB_flatten(ii, idx_ms_comb, t) = dB; - A_backward_prod = A_backward_prod * A_list(ii, idx_rho, t); + A_backward_prod = A_backward_prod * A_list(ii, idx_ms_comb, t); } - dB_flatten(ii, idx_rho, 0) = A_forward_prod(ii, idx_rho, 0) * A_backward_prod; + dB_flatten(ii, idx_ms_comb, 0) = A_forward_prod(ii, idx_ms_comb, 0) * A_backward_prod; - const complex B = A_forward_prod(ii, idx_rho, rank); + const complex B = A_forward_prod(ii, idx_ms_comb, rank); for (int p = 0; p < ndensity; ++p) { // real-part only multiplication - Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_ctildes(mu_i, idx_rho, p))); + Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb))); } } } @@ -1011,43 +1030,43 @@ template KOKKOS_INLINE_FUNCTION void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeWeights, const int& iter) const { - const int idx_rho = iter / chunk_size; + const int idx_ms_comb = iter / chunk_size; const int ii = iter % chunk_size; const int i = d_ilist[ii + chunk_offset]; const int mu_i = d_map(type(i)); - if (idx_rho >= d_idx_rho_count(mu_i)) return; + if (idx_ms_comb >= d_idx_ms_combs_count(mu_i)) return; const int ndensity = d_ndensity(mu_i); - const int offset = d_offsets(mu_i, idx_rho); - const int rank = d_rank(mu_i, offset); + const int func_ind = d_func_inds(mu_i, idx_ms_comb); + const int rank = d_rank(mu_i, func_ind); // Weights and theta calculation if (rank == 1) { - const int mu = d_mus(mu_i, offset, 0); - const int n = d_ns(mu_i, offset, 0); + const int mu = d_mus(mu_i, func_ind, 0); + const int n = d_ns(mu_i, func_ind, 0); double theta = 0.0; for (int p = 0; p < ndensity; ++p) { // for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1 - theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p); + theta += dF_drho(ii, p) * d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb); } Kokkos::atomic_add(&weights_rank1(ii, mu, n - 1), theta); } else { // rank > 1 double theta = 0.0; for (int p = 0; p < ndensity; ++p) - theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p); + theta += dF_drho(ii, p) * d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb); theta *= 0.5; // 0.5 factor due to possible double counting ??? for (int t = 0; t < rank; ++t) { - const int m_t = d_ms_combs(mu_i, idx_rho, t); + const int m_t = d_ms_combs(mu_i, idx_ms_comb, t); const int factor = (m_t % 2 == 0 ? 1 : -1); - const complex dB = dB_flatten(ii, idx_rho, t); - const int mu_t = d_mus(mu_i, offset, t); - const int n_t = d_ns(mu_i, offset, t); - const int l_t = d_ls(mu_i, offset, t); + const complex dB = dB_flatten(ii, idx_ms_comb, t); + const int mu_t = d_mus(mu_i, func_ind, t); + const int n_t = d_ns(mu_i, func_ind, t); + const int l_t = d_ls(mu_i, func_ind, t); const int idx = l_t * (l_t + 1) + m_t; // (l, m) const complex value = theta * dB; Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).re), value.re); @@ -1687,15 +1706,17 @@ double PairPACEExtrapolationKokkos::memory_usage() bytes += MemKK::memory_usage(d_npoti); bytes += MemKK::memory_usage(d_wpre); bytes += MemKK::memory_usage(d_mexp); - bytes += MemKK::memory_usage(d_idx_rho_count); + bytes += MemKK::memory_usage(d_idx_ms_combs_count); bytes += MemKK::memory_usage(d_rank); bytes += MemKK::memory_usage(d_num_ms_combs); - bytes += MemKK::memory_usage(d_offsets); + bytes += MemKK::memory_usage(d_func_inds); bytes += MemKK::memory_usage(d_mus); bytes += MemKK::memory_usage(d_ns); bytes += MemKK::memory_usage(d_ls); bytes += MemKK::memory_usage(d_ms_combs); - bytes += MemKK::memory_usage(d_ctildes); +// bytes += MemKK::memory_usage(d_ctildes); + bytes += MemKK::memory_usage(d_gen_cgs); + bytes += MemKK::memory_usage(d_coeffs); bytes += MemKK::memory_usage(alm); bytes += MemKK::memory_usage(blm); bytes += MemKK::memory_usage(cl); diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h index 482d068725..568d12b90c 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.h +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h @@ -23,7 +23,7 @@ PairStyle(pace/extrapolation/kk/host,PairPACEExtrapolationKokkos); #ifndef LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H #define LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H -#include "pair_pace.h" +#include "pair_pace_extrapolation.h" #include "ace-evaluator/ace_radial.h" #include "kokkos_type.h" #include "pair_kokkos.h" @@ -31,7 +31,7 @@ PairStyle(pace/extrapolation/kk/host,PairPACEExtrapolationKokkos); namespace LAMMPS_NS { template -class PairPACEExtrapolationKokkos : public PairPACE { +class PairPACEExtrapolationKokkos : public PairPACEExtrapolation { public: struct TagPairPACEComputeNeigh{}; struct TagPairPACEComputeRadial{}; @@ -95,7 +95,7 @@ class PairPACEExtrapolationKokkos : public PairPACE { void operator() (TagPairPACEComputeForce,const int& ii, EV_FLOAT&) const; protected: - int inum, maxneigh, chunk_size, chunk_offset, idx_rho_max; + int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max; int host_flag; int eflag, vflag; @@ -274,15 +274,17 @@ class PairPACEExtrapolationKokkos : public PairPACE { t_ace_2d d_mexp; // tilde - t_ace_1i d_idx_rho_count; + t_ace_1i d_idx_ms_combs_count; t_ace_2i d_rank; t_ace_2i d_num_ms_combs; - t_ace_2i d_offsets; + t_ace_2i d_func_inds; t_ace_3i d_mus; t_ace_3i d_ns; t_ace_3i d_ls; t_ace_3i d_ms_combs; - t_ace_3d d_ctildes; +// t_ace_3d d_ctildes; + t_ace_2d d_gen_cgs; + t_ace_3d d_coeffs; t_ace_3d3 f_ij; diff --git a/src/ML-PACE/pair_pace_extrapolation.cpp b/src/ML-PACE/pair_pace_extrapolation.cpp index ec185e75df..2c5e5fafe9 100644 --- a/src/ML-PACE/pair_pace_extrapolation.cpp +++ b/src/ML-PACE/pair_pace_extrapolation.cpp @@ -98,6 +98,8 @@ PairPACEExtrapolation::PairPACEExtrapolation(LAMMPS *lmp) : Pair(lmp) scale = nullptr; flag_compute_extrapolation_grade = 0; extrapolation_grade_gamma = nullptr; + + chunksize = 4096; } /* ---------------------------------------------------------------------- @@ -133,7 +135,7 @@ void PairPACEExtrapolation::compute(int eflag, int vflag) double **x = atom->x; double **f = atom->f; - tagint *tag = atom->tag; +// tagint *tag = atom->tag; int *type = atom->type; // number of atoms in cell int nlocal = atom->nlocal; @@ -141,7 +143,7 @@ void PairPACEExtrapolation::compute(int eflag, int vflag) int newton_pair = force->newton_pair; // number of atoms including ghost atoms - int nall = nlocal + atom->nghost; +// int nall = nlocal + atom->nghost; // inum: length of the neighborlists list inum = list->inum; @@ -283,7 +285,20 @@ void PairPACEExtrapolation::allocate() void PairPACEExtrapolation::settings(int narg, char **arg) { - if (narg > 0) error->all(FLERR, "Pair style pace/extrapolation supports no keywords"); +// if (narg > 2) error->all(FLERR, "Pair style pace/extrapolation supports no keywords"); + if (narg > 2) utils::missing_cmd_args(FLERR, "pair_style pace/extrapolation", error); + // ACE potentials are parameterized in metal units + if (strcmp("metal", update->unit_style) != 0) + error->all(FLERR, "ACE potentials require 'metal' units"); + + int iarg = 0; + while (iarg < narg) { + if (strcmp(arg[iarg], "chunksize") == 0) { + chunksize = utils::inumeric(FLERR, arg[iarg + 1], false, lmp); + iarg += 2; + } else + error->all(FLERR, "Unknown pair_style pace keyword: {}", arg[iarg]); + } if (comm->me == 0) utils::logmesg(lmp, "ACE/AL version: {}.{}.{}\n", VERSION_YEAR, VERSION_MONTH, VERSION_DAY); @@ -343,7 +358,6 @@ void PairPACEExtrapolation::coeff(int narg, char **arg) aceimpl->rec_ace->element_type_mapping.init(atom->ntypes + 1); aceimpl->rec_ace->element_type_mapping.fill(-1); //-1 means atom not included into potential - FILE *species_type_file = nullptr; const int n = atom->ntypes; element_names.resize(n); diff --git a/src/ML-PACE/pair_pace_extrapolation.h b/src/ML-PACE/pair_pace_extrapolation.h index c5d9da23db..6f7eeb279e 100644 --- a/src/ML-PACE/pair_pace_extrapolation.h +++ b/src/ML-PACE/pair_pace_extrapolation.h @@ -49,13 +49,15 @@ class PairPACEExtrapolation : public Pair { struct ACEALImpl *aceimpl; int nmax; - void allocate(); + virtual void allocate(); std::vector element_names; // list of elements (used by dump pace/extrapolation) double *extrapolation_grade_gamma; //per-atom gamma value int flag_compute_extrapolation_grade; double **scale; + + int chunksize; }; } // namespace LAMMPS_NS From 014b892e3b13fd3e160d17fe5f7135052a810b49 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Mon, 2 Jan 2023 23:34:24 +0100 Subject: [PATCH 03/12] WIP: - add TagPairPACEComputeGamma kernel - add d_total_basis_size - add gamma_flag, d_ASI, projections and gamma --- src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 109 ++++++++++++++---- src/KOKKOS/pair_pace_extrapolation_kokkos.h | 12 +- 2 files changed, 100 insertions(+), 21 deletions(-) diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp index 006cf5e609..c5ca97d036 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp @@ -13,7 +13,7 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Stan Moore (SNL) + Contributing author: Yury Lysogorskiy (ICAMS) ------------------------------------------------------------------------- */ #include "pair_pace_extrapolation_kokkos.h" @@ -29,9 +29,9 @@ #include "neighbor_kokkos.h" #include "neigh_request.h" -#include "ace-evaluator/ace_c_basis.h" -#include "ace-evaluator/ace_evaluator.h" -#include "ace-evaluator/ace_recursive.h" +//#include "ace-evaluator/ace_c_basis.h" +//#include "ace-evaluator/ace_evaluator.h" +//#include "ace-evaluator/ace_recursive.h" #include "ace-evaluator/ace_version.h" #include "ace-evaluator/ace_radial.h" @@ -42,21 +42,16 @@ namespace LAMMPS_NS { struct ACEALImpl { - ACEALImpl() : basis_set(nullptr), ace(nullptr), ctilde_basis_set(nullptr), rec_ace(nullptr) {} + ACEALImpl() : basis_set(nullptr), ace(nullptr) {} ~ACEALImpl() { delete basis_set; delete ace; - - delete ctilde_basis_set; - delete rec_ace; } ACEBBasisSet *basis_set; ACEBEvaluator *ace; - ACECTildeBasisSet *ctilde_basis_set; - ACERecursiveEvaluator *rec_ace; }; } // namespace LAMMPS_NS @@ -132,7 +127,12 @@ void PairPACEExtrapolationKokkos::grow(int natom, int maxneigh) // hard-core repulsion MemKK::realloc_kokkos(rho_core, "pace:rho_core", natom); MemKK::realloc_kokkos(dF_drho_core, "pace:dF_drho_core", natom); + MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_ms_combs_max, basis_set->rankmax); + + //B-projections + MemKK::realloc_kokkos(projections, "pace:projections", natom, total_num_functions_max); // per-atom B-projections + MemKK::realloc_kokkos(gamma, "pace:gamma", natom); // per-atom gamma } if (((int)ylm.extent(0) < natom) || ((int)ylm.extent(1) < maxneigh)) { @@ -271,14 +271,17 @@ template void PairPACEExtrapolationKokkos::copy_tilde() { auto basis_set = aceimpl->basis_set; + auto b_evaluator = aceimpl->ace; // flatten loops, get per-element count and max idx_ms_combs_max = 0; - int total_basis_size_max = 0; + total_num_functions_max = 0; MemKK::realloc_kokkos(d_idx_ms_combs_count, "pace:idx_ms_combs_count", nelements); + MemKK::realloc_kokkos(d_total_basis_size, "pace:total_basis_size", nelements); auto h_idx_ms_combs_count = Kokkos::create_mirror_view(d_idx_ms_combs_count); + auto h_total_basis_size = Kokkos::create_mirror_view(d_total_basis_size); for (int mu = 0; mu < nelements; mu++) { int idx_ms_combs = 0; @@ -301,21 +304,25 @@ void PairPACEExtrapolationKokkos::copy_tilde() } h_idx_ms_combs_count(mu) = idx_ms_combs; idx_ms_combs_max = MAX(idx_ms_combs_max, idx_ms_combs); - total_basis_size_max = MAX(total_basis_size_max, total_basis_size_rank1 + total_basis_size); + total_num_functions_max = MAX(total_num_functions_max, total_basis_size_rank1 + total_basis_size); + h_total_basis_size(mu) = total_basis_size_rank1 + total_basis_size; } Kokkos::deep_copy(d_idx_ms_combs_count, h_idx_ms_combs_count); + Kokkos::deep_copy(d_total_basis_size, h_total_basis_size); - MemKK::realloc_kokkos(d_rank, "pace:rank", nelements, total_basis_size_max); - MemKK::realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_basis_size_max); + MemKK::realloc_kokkos(d_rank, "pace:rank", nelements, total_num_functions_max); + MemKK::realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_num_functions_max); MemKK::realloc_kokkos(d_func_inds, "pace:func_inds", nelements, idx_ms_combs_max); - MemKK::realloc_kokkos(d_mus, "pace:mus", nelements, total_basis_size_max, basis_set->rankmax); - MemKK::realloc_kokkos(d_ns, "pace:ns", nelements, total_basis_size_max, basis_set->rankmax); - MemKK::realloc_kokkos(d_ls, "pace:ls", nelements, total_basis_size_max, basis_set->rankmax); + MemKK::realloc_kokkos(d_mus, "pace:mus", nelements, total_num_functions_max, basis_set->rankmax); + MemKK::realloc_kokkos(d_ns, "pace:ns", nelements, total_num_functions_max, basis_set->rankmax); + MemKK::realloc_kokkos(d_ls, "pace:ls", nelements, total_num_functions_max, basis_set->rankmax); MemKK::realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_ms_combs_max, basis_set->rankmax); - //MemKK::realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_ms_combs_max, basis_set->ndensitymax); MemKK::realloc_kokkos(d_gen_cgs, "pace:gen_cgs", nelements, idx_ms_combs_max); - MemKK::realloc_kokkos(d_coeffs, "pace:coeffs", nelements, total_basis_size_max, basis_set->ndensitymax); + MemKK::realloc_kokkos(d_coeffs, "pace:coeffs", nelements, total_num_functions_max, basis_set->ndensitymax); + // active set inverted + MemKK::realloc_kokkos(d_ASI, "pace:ASI", nelements, total_num_functions_max, total_num_functions_max); + auto h_rank = Kokkos::create_mirror_view(d_rank); auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs); @@ -327,6 +334,8 @@ void PairPACEExtrapolationKokkos::copy_tilde() // auto h_ctildes = Kokkos::create_mirror_view(d_ctildes); auto h_gen_cgs = Kokkos::create_mirror_view(d_gen_cgs); auto h_coeffs = Kokkos::create_mirror_view(d_coeffs); +// asi + auto h_ASI = Kokkos::create_mirror_view(d_ASI); // copy values on host @@ -390,6 +399,14 @@ void PairPACEExtrapolationKokkos::copy_tilde() idx_ms_comb++; } } + + // ASI + const auto &A_as_inv = b_evaluator->A_active_set_inv.at(mu); + for(int i = 0; i < total_basis_size_rank1 + total_basis_size; i++) + for(int j = 0; j < total_basis_size_rank1 + total_basis_size; j++){ + h_ASI(mu,i,j)=A_as_inv(i,j); + } + } Kokkos::deep_copy(d_rank, h_rank); @@ -402,6 +419,7 @@ void PairPACEExtrapolationKokkos::copy_tilde() // Kokkos::deep_copy(d_ctildes, h_ctildes); Kokkos::deep_copy(d_gen_cgs, h_gen_cgs); Kokkos::deep_copy(d_coeffs, h_coeffs); + Kokkos::deep_copy(d_ASI, h_ASI); } /* ---------------------------------------------------------------------- @@ -591,6 +609,7 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user chunk_offset = 0; + gamma_flag = 1; grow(chunk_size, maxneigh); @@ -604,6 +623,9 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in Kokkos::deep_copy(A_rank1, 0.0); Kokkos::deep_copy(rhos, 0.0); + Kokkos::deep_copy(projections, 0.0); + Kokkos::deep_copy(gamma, 0.0); + EV_FLOAT ev_tmp; if (chunk_size > inum - chunk_offset) @@ -665,6 +687,12 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in Kokkos::parallel_for("ComputeFS",policy_fs,*this); } + //ComputeGamma + if (gamma_flag) { + typename Kokkos::RangePolicy policy_gamma(0,chunk_size); + Kokkos::parallel_for("ComputeGamma",policy_gamma,*this); + } + //ComputeWeights { typename Kokkos::RangePolicy policy_weights(0, chunk_size * idx_ms_combs_max); @@ -952,6 +980,12 @@ void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeRho, //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1 Kokkos::atomic_add(&rhos(ii, p), d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb) * A_cur); } + + + //gamma_i + if(gamma_flag) + Kokkos::atomic_add(&projections(ii, func_ind), d_gen_cgs(mu_i, idx_ms_comb) * A_cur); + } else { // rank > 1 // loop over {ms} combinations in sum @@ -987,6 +1021,9 @@ void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeRho, // real-part only multiplication Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb))); } + //gamma_i + if(gamma_flag) + Kokkos::atomic_add(&projections(ii, func_ind), B.real_part_product(d_gen_cgs(mu_i, idx_ms_comb))); } } @@ -1024,6 +1061,35 @@ void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeFS, } } + +/* ---------------------------------------------------------------------- */ + + +template +KOKKOS_INLINE_FUNCTION +void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeGamma, const int& ii) const +{ + const int i = d_ilist[ii + chunk_offset]; + const int mu_i = d_map(type(i)); + const int basis_size = d_total_basis_size(mu_i); + + + double gamma_max = 0; + for (int j = 0; j gamma_max) + gamma_max = abs(current_gamma); + } + + // tally energy contribution + gamma(ii) = gamma_max; + +} /* ---------------------------------------------------------------------- */ template @@ -1714,13 +1780,16 @@ double PairPACEExtrapolationKokkos::memory_usage() bytes += MemKK::memory_usage(d_ns); bytes += MemKK::memory_usage(d_ls); bytes += MemKK::memory_usage(d_ms_combs); -// bytes += MemKK::memory_usage(d_ctildes); bytes += MemKK::memory_usage(d_gen_cgs); bytes += MemKK::memory_usage(d_coeffs); bytes += MemKK::memory_usage(alm); bytes += MemKK::memory_usage(blm); bytes += MemKK::memory_usage(cl); bytes += MemKK::memory_usage(dl); + bytes += MemKK::memory_usage(d_total_basis_size); + bytes += MemKK::memory_usage(d_ASI); + bytes += MemKK::memory_usage(projections); + bytes += MemKK::memory_usage(gamma); if (k_splines_gk.h_view.data()) { for (int i = 0; i < nelements; i++) { diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h index 568d12b90c..a5b4f8abc7 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.h +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h @@ -40,6 +40,7 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation { struct TagPairPACEConjugateAi{}; struct TagPairPACEComputeRho{}; struct TagPairPACEComputeFS{}; + struct TagPairPACEComputeGamma{}; struct TagPairPACEComputeWeights{}; struct TagPairPACEComputeDerivative{}; @@ -80,6 +81,9 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation { KOKKOS_INLINE_FUNCTION void operator() (TagPairPACEComputeFS,const int& ii) const; + KOKKOS_INLINE_FUNCTION + void operator() (TagPairPACEComputeGamma, const int& ii) const; + KOKKOS_INLINE_FUNCTION void operator() (TagPairPACEComputeWeights,const int& iter) const; @@ -95,8 +99,9 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation { void operator() (TagPairPACEComputeForce,const int& ii, EV_FLOAT&) const; protected: - int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max; + int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max, total_num_functions_max; int host_flag; + int gamma_flag; int eflag, vflag; @@ -236,6 +241,10 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation { t_ace_3d d_values; t_ace_3d d_derivatives; + // inverted active set + t_ace_3d d_ASI; + t_ace_2d projections; + t_ace_1d gamma; // Spherical Harmonics void pre_compute_harmonics(int); @@ -275,6 +284,7 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation { // tilde t_ace_1i d_idx_ms_combs_count; + t_ace_1i d_total_basis_size; t_ace_2i d_rank; t_ace_2i d_num_ms_combs; t_ace_2i d_func_inds; From 1f36bc49ab234151b95525d3e7c502af845782a5 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Tue, 3 Jan 2023 14:08:31 +0100 Subject: [PATCH 04/12] - add extract and extract_peratom methods - rename device gamma array to d_gamma - make host h_gamma array - copy from h_gamma to host extrapolation_grade_gamma array for each chunk - transpose two last dimensions of d_ASI (small improvement of performance) - manage grows of extrapolation_grade_gamma --- src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 91 +++++++++++++++---- src/KOKKOS/pair_pace_extrapolation_kokkos.h | 14 ++- 2 files changed, 82 insertions(+), 23 deletions(-) diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp index c5ca97d036..cbaf92d800 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp @@ -29,9 +29,6 @@ #include "neighbor_kokkos.h" #include "neigh_request.h" -//#include "ace-evaluator/ace_c_basis.h" -//#include "ace-evaluator/ace_evaluator.h" -//#include "ace-evaluator/ace_recursive.h" #include "ace-evaluator/ace_version.h" #include "ace-evaluator/ace_radial.h" @@ -132,7 +129,7 @@ void PairPACEExtrapolationKokkos::grow(int natom, int maxneigh) //B-projections MemKK::realloc_kokkos(projections, "pace:projections", natom, total_num_functions_max); // per-atom B-projections - MemKK::realloc_kokkos(gamma, "pace:gamma", natom); // per-atom gamma + MemKK::realloc_kokkos(d_gamma, "pace:gamma", natom); // per-atom gamma } if (((int)ylm.extent(0) < natom) || ((int)ylm.extent(1) < maxneigh)) { @@ -321,7 +318,9 @@ void PairPACEExtrapolationKokkos::copy_tilde() MemKK::realloc_kokkos(d_gen_cgs, "pace:gen_cgs", nelements, idx_ms_combs_max); MemKK::realloc_kokkos(d_coeffs, "pace:coeffs", nelements, total_num_functions_max, basis_set->ndensitymax); // active set inverted - MemKK::realloc_kokkos(d_ASI, "pace:ASI", nelements, total_num_functions_max, total_num_functions_max); + t_ace_3d d_ASI_temp; + MemKK::realloc_kokkos(d_ASI_temp, "pace:ASI_temp", nelements, total_num_functions_max, total_num_functions_max); + auto h_rank = Kokkos::create_mirror_view(d_rank); @@ -331,11 +330,10 @@ void PairPACEExtrapolationKokkos::copy_tilde() auto h_ns = Kokkos::create_mirror_view(d_ns); auto h_ls = Kokkos::create_mirror_view(d_ls); auto h_ms_combs = Kokkos::create_mirror_view(d_ms_combs); -// auto h_ctildes = Kokkos::create_mirror_view(d_ctildes); auto h_gen_cgs = Kokkos::create_mirror_view(d_gen_cgs); auto h_coeffs = Kokkos::create_mirror_view(d_coeffs); // asi - auto h_ASI = Kokkos::create_mirror_view(d_ASI); + auto h_ASI = Kokkos::create_mirror_view(d_ASI_temp); // copy values on host @@ -404,7 +402,7 @@ void PairPACEExtrapolationKokkos::copy_tilde() const auto &A_as_inv = b_evaluator->A_active_set_inv.at(mu); for(int i = 0; i < total_basis_size_rank1 + total_basis_size; i++) for(int j = 0; j < total_basis_size_rank1 + total_basis_size; j++){ - h_ASI(mu,i,j)=A_as_inv(i,j); + h_ASI(mu,i,j)=A_as_inv(j,i); // transpose back for better performance on GPU } } @@ -416,10 +414,10 @@ void PairPACEExtrapolationKokkos::copy_tilde() Kokkos::deep_copy(d_ns, h_ns); Kokkos::deep_copy(d_ls, h_ls); Kokkos::deep_copy(d_ms_combs, h_ms_combs); -// Kokkos::deep_copy(d_ctildes, h_ctildes); Kokkos::deep_copy(d_gen_cgs, h_gen_cgs); Kokkos::deep_copy(d_coeffs, h_coeffs); - Kokkos::deep_copy(d_ASI, h_ASI); + Kokkos::deep_copy(d_ASI_temp, h_ASI); + d_ASI = d_ASI_temp; // copy from temopary array to const array } /* ---------------------------------------------------------------------- @@ -573,6 +571,15 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom"); d_vatom = k_vatom.view(); } + + if (gamma_flag && atom->nlocal > nmax) { + memory->destroy(extrapolation_grade_gamma); + nmax = atom->nlocal; + memory->create(extrapolation_grade_gamma, nmax, "pace/atom:gamma"); + //zeroify array + memset(extrapolation_grade_gamma, 0, nmax * sizeof(*extrapolation_grade_gamma)); + } + copymode = 1; if (!force->newton_pair) error->all(FLERR,"PairPACEExtrapolationKokkos requires 'newton on'"); @@ -609,7 +616,7 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user chunk_offset = 0; - gamma_flag = 1; + grow(chunk_size, maxneigh); @@ -624,7 +631,7 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in Kokkos::deep_copy(rhos, 0.0); Kokkos::deep_copy(projections, 0.0); - Kokkos::deep_copy(gamma, 0.0); + Kokkos::deep_copy(d_gamma, 0.0); EV_FLOAT ev_tmp; @@ -729,8 +736,15 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in } } ev += ev_tmp; - chunk_offset += chunk_size; + //if gamma_flag - copy current d_gamma to extrapolation_grade_gamma + if(gamma_flag){ + h_gamma = Kokkos::create_mirror_view(d_gamma); + Kokkos:deep_copy(h_gamma, d_gamma); + memcpy(extrapolation_grade_gamma+chunk_offset, (void *) h_gamma.data(), sizeof(double)*chunk_size); + } + + chunk_offset += chunk_size; } // end while if (need_dup) @@ -1075,19 +1089,21 @@ void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeGamm double gamma_max = 0; - for (int j = 0; j gamma_max) - gamma_max = abs(current_gamma); + current_gamma = fabs(current_gamma); + if (current_gamma > gamma_max) + gamma_max = current_gamma; } // tally energy contribution - gamma(ii) = gamma_max; + d_gamma(ii) = gamma_max; } /* ---------------------------------------------------------------------- */ @@ -1789,7 +1805,7 @@ double PairPACEExtrapolationKokkos::memory_usage() bytes += MemKK::memory_usage(d_total_basis_size); bytes += MemKK::memory_usage(d_ASI); bytes += MemKK::memory_usage(projections); - bytes += MemKK::memory_usage(gamma); + bytes += MemKK::memory_usage(d_gamma); if (k_splines_gk.h_view.data()) { for (int i = 0; i < nelements; i++) { @@ -1812,3 +1828,38 @@ template class PairPACEExtrapolationKokkos; template class PairPACEExtrapolationKokkos; #endif } + +/* ---------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + extract method for extracting value of scale variable + ---------------------------------------------------------------------- */ +template +void *PairPACEExtrapolationKokkos::extract(const char *str, int &dim) +{ + //check if str=="gamma_flag" then compute extrapolation grades on this iteration + dim = 0; + if (strcmp(str, "gamma_flag") == 0) return (void *) &gamma_flag; + + dim = 2; + if (strcmp(str, "scale") == 0) return (void *) scale; + return nullptr; +} + +/* ---------------------------------------------------------------------- + peratom requests from FixPair + return ptr to requested data + also return ncol = # of quantites per atom + 0 = per-atom vector + 1 or more = # of columns in per-atom array + return NULL if str is not recognized +---------------------------------------------------------------------- */ +template +void *PairPACEExtrapolationKokkos::extract_peratom(const char *str, int &ncol) +{ + if (strcmp(str, "gamma") == 0) { + ncol = 0; + return (void *) extrapolation_grade_gamma; + } + + return nullptr; +} \ No newline at end of file diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h index a5b4f8abc7..6983fcc338 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.h +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h @@ -98,6 +98,10 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation { KOKKOS_INLINE_FUNCTION void operator() (TagPairPACEComputeForce,const int& ii, EV_FLOAT&) const; + + void *extract(const char *str, int &dim); + void *extract_peratom(const char *str, int &ncol); + protected: int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max, total_num_functions_max; int host_flag; @@ -203,6 +207,7 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation { typedef Kokkos::View t_ace_2d; typedef Kokkos::View t_ace_2d3; typedef Kokkos::View t_ace_3d; + typedef Kokkos::View tc_ace_3d; typedef Kokkos::View t_ace_3d3; typedef Kokkos::View t_ace_3d4; typedef Kokkos::View t_ace_4d; @@ -213,6 +218,8 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation { typedef Kokkos::View t_ace_4c; typedef Kokkos::View t_ace_4c3; + typedef Kokkos::View::HostMirror th_ace_1d; + t_ace_3d A_rank1; t_ace_4c A; @@ -242,11 +249,12 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation { t_ace_3d d_derivatives; // inverted active set - t_ace_3d d_ASI; + tc_ace_3d d_ASI; t_ace_2d projections; - t_ace_1d gamma; - // Spherical Harmonics + t_ace_1d d_gamma; + th_ace_1d h_gamma; + // Spherical Harmonics void pre_compute_harmonics(int); KOKKOS_INLINE_FUNCTION From e033cebcdd8274299969f1e4e172c232db9ce7b8 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Tue, 3 Jan 2023 14:30:04 +0100 Subject: [PATCH 05/12] update lammps-user-pace version and checksum --- cmake/Modules/Packages/ML-PACE.cmake | 4 ++-- lib/pace/Install.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/Modules/Packages/ML-PACE.cmake b/cmake/Modules/Packages/ML-PACE.cmake index 0159f36c34..b64584e5a7 100644 --- a/cmake/Modules/Packages/ML-PACE.cmake +++ b/cmake/Modules/Packages/ML-PACE.cmake @@ -1,6 +1,6 @@ -set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2022.10.15.tar.gz" CACHE STRING "URL for PACE evaluator library sources") +set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2023.01.3.tar.gz" CACHE STRING "URL for PACE evaluator library sources") -set(PACELIB_MD5 "848ad6a6cc79fa82745927001fb1c9b5" CACHE STRING "MD5 checksum of PACE evaluator library tarball") +set(PACELIB_MD5 "f418d32b60e531063ac4285bf702b468" CACHE STRING "MD5 checksum of PACE evaluator library tarball") mark_as_advanced(PACELIB_URL) mark_as_advanced(PACELIB_MD5) diff --git a/lib/pace/Install.py b/lib/pace/Install.py index 9f132a9580..f7f50d4f6d 100644 --- a/lib/pace/Install.py +++ b/lib/pace/Install.py @@ -18,11 +18,11 @@ from install_helpers import fullpath, geturl, checkmd5sum # settings thisdir = fullpath('.') -version ='v.2022.10.15' +version ='v.2023.01.3' # known checksums for different PACE versions. used to validate the download. checksums = { \ - 'v.2022.10.15': '848ad6a6cc79fa82745927001fb1c9b5' + 'v.2023.01.3': 'f418d32b60e531063ac4285bf702b468' } parser = ArgumentParser(prog='Install.py', description="LAMMPS library build wrapper script") From 4d50109731a83f8c1a69ba81f0173f72f26a1364 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Tue, 3 Jan 2023 14:43:46 +0100 Subject: [PATCH 06/12] add check that LINEAR ASI must be used --- src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp index cbaf92d800..ba5ecafeac 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp @@ -429,7 +429,7 @@ void PairPACEExtrapolationKokkos::init_style() { if (host_flag) { if (lmp->kokkos->nthreads > 1) - error->all(FLERR,"Pair style pace/kk can currently only run on a single " + error->all(FLERR,"Pair style pace/extrapolation/kk can currently only run on a single " "CPU thread"); PairPACEExtrapolation::init_style(); @@ -497,6 +497,10 @@ void PairPACEExtrapolationKokkos::coeff(int narg, char **arg) { PairPACEExtrapolation::coeff(narg,arg); + auto b_evaluator = aceimpl->ace; + if (!b_evaluator->get_is_linear_extrapolation_grade()) { + error->all(FLERR,"Must use LINEAR ASI with pair pace/extrapolation/kk"); + } // Set up element lists auto h_map = Kokkos::create_mirror_view(d_map); From cfbc2d88944ffa03852e81f5176d11cde7622088 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Tue, 3 Jan 2023 14:48:25 +0100 Subject: [PATCH 07/12] update pair_pace.rst --- doc/src/pair_pace.rst | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/src/pair_pace.rst b/doc/src/pair_pace.rst index 697a9965b6..01f7df63f6 100644 --- a/doc/src/pair_pace.rst +++ b/doc/src/pair_pace.rst @@ -1,6 +1,7 @@ .. index:: pair_style pace .. index:: pair_style pace/kk .. index:: pair_style pace/extrapolation +.. index:: pair_style pace/extrapolation/kk pair_style pace command ======================= @@ -127,6 +128,9 @@ but not more often than every 20 steps. On all other steps `pair_style pace recursive` will be used. +When using the pair style *pace/extrapolation* with the KOKKOS package on GPUs +product B-basis evaluator is always used and only *linear* ASI is supported. + ---------- See the :doc:`pair_coeff ` page for alternate ways @@ -186,4 +190,4 @@ recursive, chunksize = 4096, .. _Lysogorskiy2022: -**(Lysogorskiy2022)** Lysogorskiy, Bochkarev, Mrovec, Drautz, TBS (2022). +**(Lysogorskiy2022)** Lysogorskiy, Bochkarev, Mrovec, Drautz, arXiv:2212.08716 (2022). From 295d8a69035497ea0b575b6764c349cd82a9af5c Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Tue, 3 Jan 2023 15:58:34 +0100 Subject: [PATCH 08/12] fix_pair.cpp: respect lmp->suffix when looking for pair_style name match --- src/fix_pair.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/fix_pair.cpp b/src/fix_pair.cpp index 5e5ccbf810..bd389f6632 100644 --- a/src/fix_pair.cpp +++ b/src/fix_pair.cpp @@ -36,6 +36,10 @@ FixPair::FixPair(LAMMPS *lmp, int narg, char **arg) : if (nevery < 1) error->all(FLERR,"Illegal fix pair every value: {}", nevery); pairname = utils::strdup(arg[4]); + if(lmp->suffix) { + strcat(pairname,"/"); + strcat(pairname,lmp->suffix); + } pstyle = force->pair_match(pairname,1,0); if (pstyle == nullptr) error->all(FLERR,"Pair style {} for fix pair not found", pairname); From a3572b61d84d17c5efec076663c8607aa1325c13 Mon Sep 17 00:00:00 2001 From: Stan Gerald Moore Date: Tue, 3 Jan 2023 10:09:47 -0700 Subject: [PATCH 09/12] Small cleanup --- src/KOKKOS/Install.sh | 2 + src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 129 +++++++++--------- src/ML-PACE/pair_pace_extrapolation.cpp | 4 - 3 files changed, 64 insertions(+), 71 deletions(-) diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index a5bf6437fc..77a9932d90 100755 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -320,6 +320,8 @@ action pair_multi_lucy_rx_kokkos.cpp pair_multi_lucy_rx.cpp action pair_multi_lucy_rx_kokkos.h pair_multi_lucy_rx.h action pair_pace_kokkos.cpp pair_pace.cpp action pair_pace_kokkos.h pair_pace.h +action pair_pace_extrapolation_kokkos.cpp pair_pace_extrapolation.cpp +action pair_pace_extrapolation_kokkos.h pair_pace_extrapolation.h action pair_reaxff_kokkos.cpp pair_reaxff.cpp action pair_reaxff_kokkos.h pair_reaxff.h action pair_snap_kokkos.cpp pair_snap.cpp diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp index ba5ecafeac..6369c7fba5 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp @@ -38,19 +38,18 @@ #include namespace LAMMPS_NS { - struct ACEALImpl { - ACEALImpl() : basis_set(nullptr), ace(nullptr) {} + struct ACEALImpl { + ACEALImpl() : basis_set(nullptr), ace(nullptr) {} - ~ACEALImpl() - { - delete basis_set; - delete ace; - } + ~ACEALImpl() { + delete basis_set; + delete ace; + } - ACEBBasisSet *basis_set; - ACEBEvaluator *ace; - }; -} // namespace LAMMPS_NS + ACEBBasisSet *basis_set; + ACEBEvaluator *ace; + }; +} // namespace LAMMPS_NS using namespace LAMMPS_NS; using namespace MathConst; @@ -187,7 +186,7 @@ void PairPACEExtrapolationKokkos::copy_pertype() h_rho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).rho_core_cutoff; h_drho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).drho_core_cutoff; - h_E0vals(n)= basis_set->E0vals(n); + h_E0vals(n) = basis_set->E0vals(n); h_ndensity(n) = basis_set->map_embedding_specifications.at(n).ndensity; @@ -321,8 +320,6 @@ void PairPACEExtrapolationKokkos::copy_tilde() t_ace_3d d_ASI_temp; MemKK::realloc_kokkos(d_ASI_temp, "pace:ASI_temp", nelements, total_num_functions_max, total_num_functions_max); - - auto h_rank = Kokkos::create_mirror_view(d_rank); auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs); auto h_func_inds = Kokkos::create_mirror_view(d_func_inds); @@ -332,7 +329,7 @@ void PairPACEExtrapolationKokkos::copy_tilde() auto h_ms_combs = Kokkos::create_mirror_view(d_ms_combs); auto h_gen_cgs = Kokkos::create_mirror_view(d_gen_cgs); auto h_coeffs = Kokkos::create_mirror_view(d_coeffs); -// asi + // asi auto h_ASI = Kokkos::create_mirror_view(d_ASI_temp); // copy values on host @@ -380,7 +377,7 @@ void PairPACEExtrapolationKokkos::copy_tilde() } for (int p = 0; p < ndensity; ++p) - h_coeffs(mu, func_ind_through, p) = func->coeff[p]; + h_coeffs(mu, func_ind_through, p) = func->coeff[p]; // loop over {ms} combinations in sum @@ -400,11 +397,10 @@ void PairPACEExtrapolationKokkos::copy_tilde() // ASI const auto &A_as_inv = b_evaluator->A_active_set_inv.at(mu); - for(int i = 0; i < total_basis_size_rank1 + total_basis_size; i++) - for(int j = 0; j < total_basis_size_rank1 + total_basis_size; j++){ - h_ASI(mu,i,j)=A_as_inv(j,i); // transpose back for better performance on GPU + for (int i = 0; i < total_basis_size_rank1 + total_basis_size; i++) + for (int j = 0; j < total_basis_size_rank1 + total_basis_size; j++){ + h_ASI(mu,i,j) = A_as_inv(j,i); // transpose back for better performance on GPU } - } Kokkos::deep_copy(d_rank, h_rank); @@ -565,6 +561,7 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in ev_init(eflag,vflag,0); // reallocate per-atom arrays if necessary + if (eflag_atom) { memoryKK->destroy_kokkos(k_eatom,eatom); memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom"); @@ -742,7 +739,7 @@ void PairPACEExtrapolationKokkos::compute(int eflag_in, int vflag_in ev += ev_tmp; //if gamma_flag - copy current d_gamma to extrapolation_grade_gamma - if(gamma_flag){ + if (gamma_flag){ h_gamma = Kokkos::create_mirror_view(d_gamma); Kokkos:deep_copy(h_gamma, d_gamma); memcpy(extrapolation_grade_gamma+chunk_offset, (void *) h_gamma.data(), sizeof(double)*chunk_size); @@ -1001,7 +998,7 @@ void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeRho, //gamma_i - if(gamma_flag) + if (gamma_flag) Kokkos::atomic_add(&projections(ii, func_ind), d_gen_cgs(mu_i, idx_ms_comb) * A_cur); } else { // rank > 1 @@ -1040,14 +1037,13 @@ void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeRho, Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb))); } //gamma_i - if(gamma_flag) - Kokkos::atomic_add(&projections(ii, func_ind), B.real_part_product(d_gen_cgs(mu_i, idx_ms_comb))); + if (gamma_flag) + Kokkos::atomic_add(&projections(ii, func_ind), B.real_part_product(d_gen_cgs(mu_i, idx_ms_comb))); } } /* ---------------------------------------------------------------------- */ - template KOKKOS_INLINE_FUNCTION void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeFS, const int& ii) const @@ -1079,37 +1075,34 @@ void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeFS, } } - /* ---------------------------------------------------------------------- */ - template KOKKOS_INLINE_FUNCTION void PairPACEExtrapolationKokkos::operator() (TagPairPACEComputeGamma, const int& ii) const { - const int i = d_ilist[ii + chunk_offset]; - const int mu_i = d_map(type(i)); - const int basis_size = d_total_basis_size(mu_i); + const int i = d_ilist[ii + chunk_offset]; + const int mu_i = d_map(type(i)); + const int basis_size = d_total_basis_size(mu_i); + double gamma_max = 0; + double current_gamma; + for (int j = 0; j gamma_max) - gamma_max = current_gamma; - } - - // tally energy contribution - d_gamma(ii) = gamma_max; + current_gamma = fabs(current_gamma); + if (current_gamma > gamma_max) + gamma_max = current_gamma; + } + // tally energy contribution + d_gamma(ii) = gamma_max; } + /* ---------------------------------------------------------------------- */ template @@ -1824,29 +1817,20 @@ double PairPACEExtrapolationKokkos::memory_usage() return bytes; } -/* ---------------------------------------------------------------------- */ - -namespace LAMMPS_NS { -template class PairPACEExtrapolationKokkos; -#ifdef LMP_KOKKOS_GPU -template class PairPACEExtrapolationKokkos; -#endif -} - -/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- extract method for extracting value of scale variable ---------------------------------------------------------------------- */ + template void *PairPACEExtrapolationKokkos::extract(const char *str, int &dim) { - //check if str=="gamma_flag" then compute extrapolation grades on this iteration - dim = 0; - if (strcmp(str, "gamma_flag") == 0) return (void *) &gamma_flag; + //check if str=="gamma_flag" then compute extrapolation grades on this iteration + dim = 0; + if (strcmp(str, "gamma_flag") == 0) return (void *) &gamma_flag; - dim = 2; - if (strcmp(str, "scale") == 0) return (void *) scale; - return nullptr; + dim = 2; + if (strcmp(str, "scale") == 0) return (void *) scale; + return nullptr; } /* ---------------------------------------------------------------------- @@ -1857,13 +1841,24 @@ void *PairPACEExtrapolationKokkos::extract(const char *str, int &dim 1 or more = # of columns in per-atom array return NULL if str is not recognized ---------------------------------------------------------------------- */ + template void *PairPACEExtrapolationKokkos::extract_peratom(const char *str, int &ncol) { - if (strcmp(str, "gamma") == 0) { - ncol = 0; - return (void *) extrapolation_grade_gamma; - } + if (strcmp(str, "gamma") == 0) { + ncol = 0; + return (void *) extrapolation_grade_gamma; + } + + return nullptr; +} + +/* ---------------------------------------------------------------------- */ + +namespace LAMMPS_NS { +template class PairPACEExtrapolationKokkos; +#ifdef LMP_KOKKOS_GPU +template class PairPACEExtrapolationKokkos; +#endif +} - return nullptr; -} \ No newline at end of file diff --git a/src/ML-PACE/pair_pace_extrapolation.cpp b/src/ML-PACE/pair_pace_extrapolation.cpp index 2c5e5fafe9..8a0116526a 100644 --- a/src/ML-PACE/pair_pace_extrapolation.cpp +++ b/src/ML-PACE/pair_pace_extrapolation.cpp @@ -135,16 +135,12 @@ void PairPACEExtrapolation::compute(int eflag, int vflag) double **x = atom->x; double **f = atom->f; -// tagint *tag = atom->tag; int *type = atom->type; // number of atoms in cell int nlocal = atom->nlocal; int newton_pair = force->newton_pair; - // number of atoms including ghost atoms -// int nall = nlocal + atom->nghost; - // inum: length of the neighborlists list inum = list->inum; From bcb5285ef9fca4211295d45bd900448b4064ec01 Mon Sep 17 00:00:00 2001 From: Axel Kohlmeyer Date: Tue, 3 Jan 2023 14:15:12 -0500 Subject: [PATCH 10/12] update / correct suffix handling in fix pair --- doc/src/fix_pair.rst | 9 +++++++-- src/fix_pair.cpp | 17 +++++++++++++---- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/doc/src/fix_pair.rst b/doc/src/fix_pair.rst index abb44718cd..44c91f18ee 100644 --- a/doc/src/fix_pair.rst +++ b/doc/src/fix_pair.rst @@ -1,7 +1,7 @@ .. index:: fix pair fix pair command -======================= +================ Syntax """""" @@ -47,7 +47,12 @@ These are example use cases: The *N* argument determines how often the fix is invoked. The *pstyle* argument is the name of the pair style. It can be a -sub-style used in a :doc:`pair_style hybrid ` command. +sub-style used in a :doc:`pair_style hybrid ` command. If +there are multiple sub-styles using the same pair style, then *pstyle* +should be specified as "style:N", where *N* is the number of the +instance of the pair style you wish monitor (e.g., the first or second). +For example, *pstyle* could be specified as "pace/extrapolation" or +"amoeba" or "eam:1" or "eam:2". One or more *name/flag* pairs of arguments follow. Each *name* is a per-atom quantity which the pair style must recognize as an extraction diff --git a/src/fix_pair.cpp b/src/fix_pair.cpp index bd389f6632..db7b24dfea 100644 --- a/src/fix_pair.cpp +++ b/src/fix_pair.cpp @@ -36,11 +36,20 @@ FixPair::FixPair(LAMMPS *lmp, int narg, char **arg) : if (nevery < 1) error->all(FLERR,"Illegal fix pair every value: {}", nevery); pairname = utils::strdup(arg[4]); - if(lmp->suffix) { - strcat(pairname,"/"); - strcat(pairname,lmp->suffix); + char *cptr; + int nsub = 0; + if ((cptr = strchr(pairname,':'))) { + *cptr = '\0'; + nsub = utils::inumeric(FLERR,cptr+1,false,lmp); } - pstyle = force->pair_match(pairname,1,0); + + if (lmp->suffix_enable) { + if (lmp->suffix) + pstyle = force->pair_match(fmt::format("{}/{}",pairname,lmp->suffix),1,nsub); + if ((pstyle == nullptr) && lmp->suffix2) + pstyle = force->pair_match(fmt::format("{}/{}",pairname,lmp->suffix2),1,nsub); + } + if (pstyle == nullptr) pstyle = force->pair_match(pairname,1,nsub); if (pstyle == nullptr) error->all(FLERR,"Pair style {} for fix pair not found", pairname); nfield = (narg-5) / 2; From 87a8cfe299f3e194bc6b24d19660a0ed69cb9ca8 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Wed, 4 Jan 2023 12:46:50 +0100 Subject: [PATCH 11/12] - replace #include "ace-evaluator/ace_radial.h" in pair_pace_kokkos.h and pair_pace_extrapolation_kokkos.h with forward declaration "class SplineInterpolator;" - move SplineInterpolatorKokkos::operator=(const SplineInterpolator &spline) to .cpp files --- src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 18 ++++++++++++++++ src/KOKKOS/pair_pace_extrapolation_kokkos.h | 21 +++---------------- src/KOKKOS/pair_pace_kokkos.cpp | 18 ++++++++++++++++ src/KOKKOS/pair_pace_kokkos.h | 21 +++---------------- 4 files changed, 42 insertions(+), 36 deletions(-) diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp index 6369c7fba5..e6ae694cad 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp @@ -1672,7 +1672,25 @@ void PairPACEExtrapolationKokkos::evaluate_splines(const int ii, con } /* ---------------------------------------------------------------------- */ +template +void PairPACEExtrapolationKokkos::SplineInterpolatorKokkos::operator=(const SplineInterpolator &spline) { + cutoff = spline.cutoff; + deltaSplineBins = spline.deltaSplineBins; + ntot = spline.ntot; + nlut = spline.nlut; + invrscalelookup = spline.invrscalelookup; + rscalelookup = spline.rscalelookup; + num_of_functions = spline.num_of_functions; + lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions); + auto h_lookupTable = Kokkos::create_mirror_view(lookupTable); + for (int i = 0; i < ntot+1; i++) + for (int j = 0; j < num_of_functions; j++) + for (int k = 0; k < 4; k++) + h_lookupTable(i, j, k) = spline.lookupTable(i, j, k); + Kokkos::deep_copy(lookupTable, h_lookupTable); +} +/* ---------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void PairPACEExtrapolationKokkos::SplineInterpolatorKokkos::calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h index 6983fcc338..a65841f8f6 100644 --- a/src/KOKKOS/pair_pace_extrapolation_kokkos.h +++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h @@ -24,10 +24,11 @@ PairStyle(pace/extrapolation/kk/host,PairPACEExtrapolationKokkos); #define LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H #include "pair_pace_extrapolation.h" -#include "ace-evaluator/ace_radial.h" #include "kokkos_type.h" #include "pair_kokkos.h" +class SplineInterpolator; + namespace LAMMPS_NS { template @@ -313,23 +314,7 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation { t_ace_3d4 lookupTable; - void operator=(const SplineInterpolator &spline) { - cutoff = spline.cutoff; - deltaSplineBins = spline.deltaSplineBins; - ntot = spline.ntot; - nlut = spline.nlut; - invrscalelookup = spline.invrscalelookup; - rscalelookup = spline.rscalelookup; - num_of_functions = spline.num_of_functions; - - lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions); - auto h_lookupTable = Kokkos::create_mirror_view(lookupTable); - for (int i = 0; i < ntot+1; i++) - for (int j = 0; j < num_of_functions; j++) - for (int k = 0; k < 4; k++) - h_lookupTable(i, j, k) = spline.lookupTable(i, j, k); - Kokkos::deep_copy(lookupTable, h_lookupTable); - } + void operator=(const SplineInterpolator &spline); void deallocate() { lookupTable = t_ace_3d4(); diff --git a/src/KOKKOS/pair_pace_kokkos.cpp b/src/KOKKOS/pair_pace_kokkos.cpp index dd5b97fe32..6f1e3feaf8 100644 --- a/src/KOKKOS/pair_pace_kokkos.cpp +++ b/src/KOKKOS/pair_pace_kokkos.cpp @@ -1574,7 +1574,25 @@ void PairPACEKokkos::evaluate_splines(const int ii, const int jj, do } /* ---------------------------------------------------------------------- */ +template +void PairPACEKokkos::SplineInterpolatorKokkos::operator=(const SplineInterpolator &spline) { + cutoff = spline.cutoff; + deltaSplineBins = spline.deltaSplineBins; + ntot = spline.ntot; + nlut = spline.nlut; + invrscalelookup = spline.invrscalelookup; + rscalelookup = spline.rscalelookup; + num_of_functions = spline.num_of_functions; + lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions); + auto h_lookupTable = Kokkos::create_mirror_view(lookupTable); + for (int i = 0; i < ntot+1; i++) + for (int j = 0; j < num_of_functions; j++) + for (int k = 0; k < 4; k++) + h_lookupTable(i, j, k) = spline.lookupTable(i, j, k); + Kokkos::deep_copy(lookupTable, h_lookupTable); +} +/* ---------------------------------------------------------------------- */ template KOKKOS_INLINE_FUNCTION void PairPACEKokkos::SplineInterpolatorKokkos::calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const diff --git a/src/KOKKOS/pair_pace_kokkos.h b/src/KOKKOS/pair_pace_kokkos.h index cc2a3cb674..39cfd100f8 100644 --- a/src/KOKKOS/pair_pace_kokkos.h +++ b/src/KOKKOS/pair_pace_kokkos.h @@ -24,10 +24,11 @@ PairStyle(pace/kk/host,PairPACEKokkos); #define LMP_PAIR_PACE_KOKKOS_H #include "pair_pace.h" -#include "ace-evaluator/ace_radial.h" #include "kokkos_type.h" #include "pair_kokkos.h" +class SplineInterpolator; + namespace LAMMPS_NS { template @@ -293,23 +294,7 @@ class PairPACEKokkos : public PairPACE { t_ace_3d4 lookupTable; - void operator=(const SplineInterpolator &spline) { - cutoff = spline.cutoff; - deltaSplineBins = spline.deltaSplineBins; - ntot = spline.ntot; - nlut = spline.nlut; - invrscalelookup = spline.invrscalelookup; - rscalelookup = spline.rscalelookup; - num_of_functions = spline.num_of_functions; - - lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions); - auto h_lookupTable = Kokkos::create_mirror_view(lookupTable); - for (int i = 0; i < ntot+1; i++) - for (int j = 0; j < num_of_functions; j++) - for (int k = 0; k < 4; k++) - h_lookupTable(i, j, k) = spline.lookupTable(i, j, k); - Kokkos::deep_copy(lookupTable, h_lookupTable); - } + void operator=(const SplineInterpolator &spline); void deallocate() { lookupTable = t_ace_3d4(); From 20b6355888d3b005601b62439270cd749dae24e9 Mon Sep 17 00:00:00 2001 From: Yury Lysogorskiy Date: Wed, 4 Jan 2023 15:00:03 +0100 Subject: [PATCH 12/12] refactor fix_pair.h/cpp: extract method "query_pstyle" and call it also in void FixPair::init() --- src/fix_pair.cpp | 40 +++++++++++++++++++++++++--------------- src/fix_pair.h | 2 ++ 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/src/fix_pair.cpp b/src/fix_pair.cpp index db7b24dfea..59f9dbda4a 100644 --- a/src/fix_pair.cpp +++ b/src/fix_pair.cpp @@ -21,6 +21,7 @@ #include "memory.h" #include "pair.h" #include "update.h" +#include "fmt/format.h" using namespace LAMMPS_NS; using namespace FixConst; @@ -36,20 +37,7 @@ FixPair::FixPair(LAMMPS *lmp, int narg, char **arg) : if (nevery < 1) error->all(FLERR,"Illegal fix pair every value: {}", nevery); pairname = utils::strdup(arg[4]); - char *cptr; - int nsub = 0; - if ((cptr = strchr(pairname,':'))) { - *cptr = '\0'; - nsub = utils::inumeric(FLERR,cptr+1,false,lmp); - } - - if (lmp->suffix_enable) { - if (lmp->suffix) - pstyle = force->pair_match(fmt::format("{}/{}",pairname,lmp->suffix),1,nsub); - if ((pstyle == nullptr) && lmp->suffix2) - pstyle = force->pair_match(fmt::format("{}/{}",pairname,lmp->suffix2),1,nsub); - } - if (pstyle == nullptr) pstyle = force->pair_match(pairname,1,nsub); + query_pstyle(lmp); if (pstyle == nullptr) error->all(FLERR,"Pair style {} for fix pair not found", pairname); nfield = (narg-5) / 2; @@ -143,6 +131,28 @@ FixPair::FixPair(LAMMPS *lmp, int narg, char **arg) : lasttime = -1; } +/* ---------------------------------------------------------------------- */ + +void FixPair::query_pstyle(LAMMPS *lmp) { + char *cptr=nullptr; + int nsub = 0; + if ((cptr = strchr(pairname, ':'))) { + *cptr = '\0'; + nsub = utils::inumeric(FLERR,cptr+1,false,lmp); + } + pstyle = nullptr; + if (lmp->suffix_enable) { + if (lmp->suffix) { + pstyle = force->pair_match(fmt::format("{}/{}", pairname, lmp->suffix), 1, nsub); + if (pstyle == nullptr && (lmp->suffix2)) { + pstyle = force->pair_match(fmt::format("{}/{}", pairname, lmp->suffix2), 1, nsub); + } + } + } + if (pstyle == nullptr) pstyle = force->pair_match(pairname, 1, nsub); +} + + /* ---------------------------------------------------------------------- */ FixPair::~FixPair() @@ -184,7 +194,7 @@ void FixPair::init() { // insure pair style still exists - pstyle = force->pair_match(pairname,1,0); + query_pstyle(lmp); if (pstyle == nullptr) error->all(FLERR,"Pair style {} for fix pair not found", pairname); } diff --git a/src/fix_pair.h b/src/fix_pair.h index 765b261ee5..57fbdf8ecb 100644 --- a/src/fix_pair.h +++ b/src/fix_pair.h @@ -56,6 +56,8 @@ class FixPair : public Fix { class Pair *pstyle; double *vector; double **array; + + void query_pstyle(LAMMPS *lmp); }; } // namespace LAMMPS_NS