From 656505642405168137d531896e106172151791ce Mon Sep 17 00:00:00 2001
From: Yury Lysogorskiy <yury.lysogorskiy@rub.de>
Date: Mon, 2 Jan 2023 12:21:00 +0100
Subject: [PATCH 01/12] WIP: add pair_pace_extrapolation_kokkos.cpp/h

---
 src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 1724 +++++++++++++++++
 src/KOKKOS/pair_pace_extrapolation_kokkos.h   |  334 ++++
 2 files changed, 2058 insertions(+)
 create mode 100644 src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
 create mode 100644 src/KOKKOS/pair_pace_extrapolation_kokkos.h
diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
new file mode 100644
index 0000000000..b4f4932db4
--- /dev/null
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
@@ -0,0 +1,1724 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   aE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Stan Moore (SNL)
+------------------------------------------------------------------------- */
+
+#include "pair_pace_extrapolation_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "math_const.h"
+#include "memory_kokkos.h"
+#include "neighbor_kokkos.h"
+#include "neigh_request.h"
+
+#include "ace-evaluator/ace_c_basis.h"
+#include "ace-evaluator/ace_evaluator.h"
+#include "ace-evaluator/ace_recursive.h"
+#include "ace-evaluator/ace_version.h"
+#include "ace-evaluator/ace_radial.h"
+#include <cstring>
+
+namespace LAMMPS_NS {
+struct ACEImpl {
+  ACEImpl() : basis_set(nullptr), ace(nullptr) {}
+  ~ACEImpl()
+  {
+    delete basis_set;
+    delete ace;
+  }
+  ACECTildeBasisSet *basis_set;
+  ACERecursiveEvaluator *ace;
+};
+}    // namespace LAMMPS_NS
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+enum{FS,FS_SHIFTEDSCALED};
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairPACEExtrapolationKokkos<DeviceType>::PairPACEExtrapolationKokkos(LAMMPS *lmp) : PairPACE(lmp)
+{
+  respa_enable = 0;
+
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  host_flag = (execution_space == Host);
+}
+
+/* ----------------------------------------------------------------------
+   check if allocated, since class can be destructed when incomplete
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairPACEExtrapolationKokkos<DeviceType>::~PairPACEExtrapolationKokkos()
+{
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_eatom,eatom);
+  memoryKK->destroy_kokkos(k_vatom,vatom);
+
+  // deallocate views of views in serial to prevent issues in Kokkos tools
+
+  if (k_splines_gk.h_view.data()) {
+    for (int i = 0; i < nelements; i++) {
+      for (int j = 0; j < nelements; j++) {
+        k_splines_gk.h_view(i, j).deallocate();
+        k_splines_rnl.h_view(i, j).deallocate();
+        k_splines_hc.h_view(i, j).deallocate();
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::grow(int natom, int maxneigh)
+{
+  auto basis_set = aceimpl->basis_set;
+
+  if ((int)A.extent(0) < natom) {
+
+    MemKK::realloc_kokkos(A, "pace:A", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(A_rank1, "pace:A_rank1", natom, nelements, nradbase);
+
+    MemKK::realloc_kokkos(A_list, "pace:A_list", natom, idx_rho_max, basis_set->rankmax);
+    //size is +1 of max to avoid out-of-boundary array access in double-triangular scheme
+    MemKK::realloc_kokkos(A_forward_prod, "pace:A_forward_prod", natom, idx_rho_max, basis_set->rankmax + 1);
+
+    MemKK::realloc_kokkos(e_atom, "pace:e_atom", natom);
+    MemKK::realloc_kokkos(rhos, "pace:rhos", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion
+    MemKK::realloc_kokkos(dF_drho, "pace:dF_drho", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion
+
+    MemKK::realloc_kokkos(weights, "pace:weights", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(weights_rank1, "pace:weights_rank1", natom, nelements, nradbase);
+
+    // hard-core repulsion
+    MemKK::realloc_kokkos(rho_core, "pace:rho_core", natom);
+    MemKK::realloc_kokkos(dF_drho_core, "pace:dF_drho_core", natom);
+    MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_rho_max, basis_set->rankmax);
+  }
+
+  if (((int)ylm.extent(0) < natom) || ((int)ylm.extent(1) < maxneigh)) {
+
+    // radial functions
+    MemKK::realloc_kokkos(fr, "pace:fr", natom, maxneigh, nradmax, lmax + 1);
+    MemKK::realloc_kokkos(dfr, "pace:dfr", natom, maxneigh, nradmax, lmax + 1);
+    MemKK::realloc_kokkos(gr, "pace:gr", natom, maxneigh, nradbase);
+    MemKK::realloc_kokkos(dgr, "pace:dgr", natom, maxneigh, nradbase);
+    const int max_num_functions = MAX(nradbase, nradmax*(lmax + 1));
+    MemKK::realloc_kokkos(d_values, "pace:d_values", natom, maxneigh, max_num_functions);
+    MemKK::realloc_kokkos(d_derivatives, "pace:d_derivatives", natom, maxneigh, max_num_functions);
+
+    // hard-core repulsion
+    MemKK::realloc_kokkos(cr, "pace:cr", natom, maxneigh);
+    MemKK::realloc_kokkos(dcr, "pace:dcr", natom, maxneigh);
+
+    // spherical harmonics
+    MemKK::realloc_kokkos(plm, "pace:plm", natom, maxneigh, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(dplm, "pace:dplm", natom, maxneigh, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(ylm, "pace:ylm", natom, maxneigh, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(dylm, "pace:dylm", natom, maxneigh, (lmax + 1) * (lmax + 1));
+
+    // short neigh list
+    MemKK::realloc_kokkos(d_ncount, "pace:ncount", natom);
+    MemKK::realloc_kokkos(d_mu, "pace:mu", natom, maxneigh);
+    MemKK::realloc_kokkos(d_rhats, "pace:rhats", natom, maxneigh);
+    MemKK::realloc_kokkos(d_rnorms, "pace:rnorms", natom, maxneigh);
+    MemKK::realloc_kokkos(d_nearest, "pace:nearest", natom, maxneigh);
+
+    MemKK::realloc_kokkos(f_ij, "pace:f_ij", natom, maxneigh);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::copy_pertype()
+{
+  auto basis_set = aceimpl->basis_set;
+
+  MemKK::realloc_kokkos(d_rho_core_cutoff, "pace:rho_core_cutoff", nelements);
+  MemKK::realloc_kokkos(d_drho_core_cutoff, "pace:drho_core_cutoff", nelements);
+  MemKK::realloc_kokkos(d_E0vals, "pace:E0vals", nelements);
+  MemKK::realloc_kokkos(d_ndensity, "pace:ndensity", nelements);
+  MemKK::realloc_kokkos(d_npoti, "pace:npoti", nelements);
+
+  auto h_rho_core_cutoff = Kokkos::create_mirror_view(d_rho_core_cutoff);
+  auto h_drho_core_cutoff = Kokkos::create_mirror_view(d_drho_core_cutoff);
+  auto h_E0vals = Kokkos::create_mirror_view(d_E0vals);
+  auto h_ndensity = Kokkos::create_mirror_view(d_ndensity);
+  auto h_npoti = Kokkos::create_mirror_view(d_npoti);
+
+  for (int n = 0; n < nelements; n++) {
+    h_rho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).rho_core_cutoff;
+    h_drho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).drho_core_cutoff;
+
+    h_E0vals(n)= basis_set->E0vals(n);
+
+    h_ndensity(n) = basis_set->map_embedding_specifications.at(n).ndensity;
+
+    string npoti = basis_set->map_embedding_specifications.at(n).npoti;
+    if (npoti == "FinnisSinclair")
+      h_npoti(n) = FS;
+    else if (npoti == "FinnisSinclairShiftedScaled")
+      h_npoti(n) = FS_SHIFTEDSCALED;
+  }
+
+  Kokkos::deep_copy(d_rho_core_cutoff, h_rho_core_cutoff);
+  Kokkos::deep_copy(d_drho_core_cutoff, h_drho_core_cutoff);
+  Kokkos::deep_copy(d_E0vals, h_E0vals);
+  Kokkos::deep_copy(d_ndensity, h_ndensity);
+  Kokkos::deep_copy(d_npoti, h_npoti);
+
+  MemKK::realloc_kokkos(d_wpre, "pace:wpre", nelements, basis_set->ndensitymax);
+  MemKK::realloc_kokkos(d_mexp, "pace:mexp", nelements, basis_set->ndensitymax);
+
+  auto h_wpre = Kokkos::create_mirror_view(d_wpre);
+  auto h_mexp = Kokkos::create_mirror_view(d_mexp);
+
+  for (int n = 0; n < nelements; n++) {
+    const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity;
+    for (int p = 0; p < ndensity; p++) {
+      h_wpre(n, p) = basis_set->map_embedding_specifications.at(n).FS_parameters.at(p * 2 + 0);
+      h_mexp(n, p) = basis_set->map_embedding_specifications.at(n).FS_parameters.at(p * 2 + 1);
+    }
+  }
+
+  Kokkos::deep_copy(d_wpre, h_wpre);
+  Kokkos::deep_copy(d_mexp, h_mexp);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::copy_splines()
+{
+  auto basis_set = aceimpl->basis_set;
+
+  if (k_splines_gk.d_view.data()) {
+    for (int i = 0; i < nelements; i++) {
+      for (int j = 0; j < nelements; j++) {
+        k_splines_gk.h_view(i, j).deallocate();
+        k_splines_rnl.h_view(i, j).deallocate();
+        k_splines_hc.h_view(i, j).deallocate();
+      }
+    }
+  }
+
+  k_splines_gk = Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType>("pace:splines_gk", nelements, nelements);
+  k_splines_rnl = Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType>("pace:splines_rnl", nelements, nelements);
+  k_splines_hc = Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType>("pace:splines_hc", nelements, nelements);
+
+  ACERadialFunctions* radial_functions = dynamic_cast<ACERadialFunctions*>(basis_set->radial_functions);
+
+  for (int i = 0; i < nelements; i++) {
+    for (int j = 0; j < nelements; j++) {
+      k_splines_gk.h_view(i, j) = radial_functions->splines_gk(i, j);
+      k_splines_rnl.h_view(i, j) = radial_functions->splines_rnl(i, j);
+      k_splines_hc.h_view(i, j) = radial_functions->splines_hc(i, j);
+    }
+  }
+
+  k_splines_gk.modify_host();
+  k_splines_rnl.modify_host();
+  k_splines_hc.modify_host();
+
+  k_splines_gk.sync_device();
+  k_splines_rnl.sync_device();
+  k_splines_hc.sync_device();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
+{
+  auto basis_set = aceimpl->basis_set;
+
+  // flatten loops, get per-element count and max
+
+  idx_rho_max = 0;
+  int total_basis_size_max = 0;
+
+  MemKK::realloc_kokkos(d_idx_rho_count, "pace:idx_rho_count", nelements);
+  auto h_idx_rho_count = Kokkos::create_mirror_view(d_idx_rho_count);
+
+  for (int n = 0; n < nelements; n++) {
+    int idx_rho = 0;
+    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n];
+    const int total_basis_size = basis_set->total_basis_size[n];
+
+    ACECTildeBasisFunction *basis = basis_set->basis[n];
+
+    // rank=1
+    for (int func_rank1_ind = 0; func_rank1_ind < total_basis_size_rank1; ++func_rank1_ind)
+      idx_rho++;
+
+    // rank > 1
+    for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
+      ACECTildeBasisFunction *func = &basis[func_ind];
+
+      // loop over {ms} combinations in sum
+      for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind)
+        idx_rho++;
+    }
+    h_idx_rho_count(n) = idx_rho;
+    idx_rho_max = MAX(idx_rho_max, idx_rho);
+    total_basis_size_max = MAX(total_basis_size_max, total_basis_size_rank1 + total_basis_size);
+  }
+
+  Kokkos::deep_copy(d_idx_rho_count, h_idx_rho_count);
+
+  MemKK::realloc_kokkos(d_rank, "pace:rank", nelements, total_basis_size_max);
+  MemKK::realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_basis_size_max);
+  MemKK::realloc_kokkos(d_offsets, "pace:offsets", nelements, idx_rho_max);
+  MemKK::realloc_kokkos(d_mus, "pace:mus", nelements, total_basis_size_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_ns, "pace:ns", nelements, total_basis_size_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_ls, "pace:ls", nelements, total_basis_size_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_rho_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_rho_max, basis_set->ndensitymax);
+
+  auto h_rank = Kokkos::create_mirror_view(d_rank);
+  auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs);
+  auto h_offsets = Kokkos::create_mirror_view(d_offsets);
+  auto h_mus = Kokkos::create_mirror_view(d_mus);
+  auto h_ns = Kokkos::create_mirror_view(d_ns);
+  auto h_ls = Kokkos::create_mirror_view(d_ls);
+  auto h_ms_combs = Kokkos::create_mirror_view(d_ms_combs);
+  auto h_ctildes = Kokkos::create_mirror_view(d_ctildes);
+
+  // copy values on host
+
+  for (int n = 0; n < nelements; n++) {
+    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n];
+    const int total_basis_size = basis_set->total_basis_size[n];
+
+    ACECTildeBasisFunction *basis_rank1 = basis_set->basis_rank1[n];
+    ACECTildeBasisFunction *basis = basis_set->basis[n];
+
+    const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity;
+
+    int idx_rho = 0;
+
+    // rank=1
+    for (int offset = 0; offset < total_basis_size_rank1; ++offset) {
+      ACECTildeBasisFunction *func = &basis_rank1[offset];
+      h_rank(n, offset) = 1;
+      h_mus(n, offset, 0) = func->mus[0];
+      h_ns(n, offset, 0) = func->ns[0];
+      for (int p = 0; p < ndensity; p++)
+        h_ctildes(n, idx_rho, p) = func->ctildes[p];
+      h_offsets(n, idx_rho) = offset;
+      idx_rho++;
+    }
+
+    // rank > 1
+    for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
+      ACECTildeBasisFunction *func = &basis[func_ind];
+      // TODO: check if func->ctildes are zero, then skip
+
+      const int offset = total_basis_size_rank1 + func_ind;
+
+      const int rank = h_rank(n, offset) = func->rank;
+      h_num_ms_combs(n, offset) = func->num_ms_combs;
+      for (int t = 0; t < rank; t++) {
+        h_mus(n, offset, t) = func->mus[t];
+        h_ns(n, offset, t) = func->ns[t];
+        h_ls(n, offset, t) = func->ls[t];
+      }
+
+      // loop over {ms} combinations in sum
+      for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) {
+        auto ms = &func->ms_combs[ms_ind * rank]; // current ms-combination (of length = rank)
+        for (int t = 0; t < rank; t++)
+          h_ms_combs(n, idx_rho, t) = ms[t];
+
+        for (int p = 0; p < ndensity; ++p) {
+          // real-part only multiplication
+          h_ctildes(n, idx_rho, p) = func->ctildes[ms_ind * ndensity + p];
+        }
+        h_offsets(n, idx_rho) = offset;
+        idx_rho++;
+      }
+    }
+  }
+
+  Kokkos::deep_copy(d_rank, h_rank);
+  Kokkos::deep_copy(d_num_ms_combs, h_num_ms_combs);
+  Kokkos::deep_copy(d_offsets, h_offsets);
+  Kokkos::deep_copy(d_mus, h_mus);
+  Kokkos::deep_copy(d_ns, h_ns);
+  Kokkos::deep_copy(d_ls, h_ls);
+  Kokkos::deep_copy(d_ms_combs, h_ms_combs);
+  Kokkos::deep_copy(d_ctildes, h_ctildes);
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::init_style()
+{
+  if (host_flag) {
+    if (lmp->kokkos->nthreads > 1)
+      error->all(FLERR,"Pair style pace/kk can currently only run on a single "
+                         "CPU thread");
+
+    PairPACE::init_style();
+    return;
+  }
+
+  if (atom->tag_enable == 0) error->all(FLERR, "Pair style PACE requires atom IDs");
+  if (force->newton_pair == 0) error->all(FLERR, "Pair style PACE requires newton pair on");
+
+  // neighbor list request for KOKKOS
+
+  neighflag = lmp->kokkos->neighflag;
+
+  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
+  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+                           !std::is_same<DeviceType,LMPDeviceType>::value);
+  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+  if (neighflag == FULL)
+    error->all(FLERR,"Must use half neighbor list style with pair pace/kk");
+
+  auto basis_set = aceimpl->basis_set;
+
+  nelements = basis_set->nelements;
+  lmax = basis_set->lmax;
+  nradmax = basis_set->nradmax;
+  nradbase = basis_set->nradbase;
+
+  // spherical harmonics
+
+  MemKK::realloc_kokkos(alm, "pace:alm", (lmax + 1) * (lmax + 1));
+  MemKK::realloc_kokkos(blm, "pace:blm", (lmax + 1) * (lmax + 1));
+  MemKK::realloc_kokkos(cl, "pace:cl", lmax + 1);
+  MemKK::realloc_kokkos(dl, "pace:dl", lmax + 1);
+
+  pre_compute_harmonics(lmax);
+  copy_pertype();
+  copy_splines();
+  copy_tilde();
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairPACEExtrapolationKokkos<DeviceType>::init_one(int i, int j)
+{
+  double cutone = PairPACE::init_one(i,j);
+
+  k_scale.h_view(i,j) = k_scale.h_view(j,i) = scale[i][j];
+  k_scale.template modify<LMPHostType>();
+
+  k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;
+  k_cutsq.template modify<LMPHostType>();
+
+  return cutone;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  PairPACE::coeff(narg,arg);
+
+  // Set up element lists
+
+  auto h_map = Kokkos::create_mirror_view(d_map);
+
+  for (int i = 1; i <= atom->ntypes; i++)
+    h_map(i) = map[i];
+
+  Kokkos::deep_copy(d_map,h_map);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::allocate()
+{
+  PairPACE::allocate();
+
+  int n = atom->ntypes + 1;
+  MemKK::realloc_kokkos(d_map, "pace:map", n);
+
+  MemKK::realloc_kokkos(k_cutsq, "pace:cutsq", n, n);
+  d_cutsq = k_cutsq.template view<DeviceType>();
+
+  MemKK::realloc_kokkos(k_scale, "pace:scale", n, n);
+  d_scale = k_scale.template view<DeviceType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct FindMaxNumNeighs {
+  typedef DeviceType device_type;
+  NeighListKokkos<DeviceType> k_list;
+
+  FindMaxNumNeighs(NeighListKokkos<DeviceType>* nl): k_list(*nl) {}
+  ~FindMaxNumNeighs() {k_list.copymode = 1;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& ii, int& maxneigh) const {
+    const int i = k_list.d_ilist[ii];
+    const int num_neighs = k_list.d_numneigh[i];
+    if (maxneigh < num_neighs) maxneigh = num_neighs;
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  if (host_flag) {
+    atomKK->sync(Host,X_MASK|TYPE_MASK);
+    PairPACE::compute(eflag_in,vflag_in);
+    atomKK->modified(Host,F_MASK);
+    return;
+  }
+
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+
+  ev_init(eflag,vflag,0);
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memoryKK->destroy_kokkos(k_eatom,eatom);
+    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memoryKK->destroy_kokkos(k_vatom,vatom);
+    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
+    d_vatom = k_vatom.view<DeviceType>();
+  }
+
+  copymode = 1;
+  if (!force->newton_pair)
+    error->all(FLERR,"PairPACEExtrapolationKokkos requires 'newton on'");
+
+  if (recursive)
+    error->all(FLERR,"Must use 'product' algorithm with pair pace/kk on the GPU");
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_scale.template sync<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+  inum = list->inum;
+
+  need_dup = lmp->kokkos->need_dup<DeviceType>();
+  if (need_dup) {
+    dup_f     = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f);
+    dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom);
+  } else {
+    ndup_f     = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f);
+    ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom);
+  }
+
+  maxneigh = 0;
+  Kokkos::parallel_reduce("pace::find_maxneigh", inum, FindMaxNumNeighs<DeviceType>(k_list), Kokkos::Max<int>(maxneigh));
+
+  int vector_length_default = 1;
+  int team_size_default = 1;
+  if (!host_flag)
+    team_size_default = 32;
+
+  chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user
+  chunk_offset = 0;
+
+  grow(chunk_size, maxneigh);
+
+  EV_FLOAT ev;
+
+  while (chunk_offset < inum) { // chunk up loop to prevent running out of memory
+
+    Kokkos::deep_copy(weights, 0.0);
+    Kokkos::deep_copy(weights_rank1, 0.0);
+    Kokkos::deep_copy(A, 0.0);
+    Kokkos::deep_copy(A_rank1, 0.0);
+    Kokkos::deep_copy(rhos, 0.0);
+
+    EV_FLOAT ev_tmp;
+
+    if (chunk_size > inum - chunk_offset)
+      chunk_size = inum - chunk_offset;
+
+    //Neigh
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagPairPACEComputeNeigh>(chunk_size,team_size,vector_length);
+      int scratch_size = scratch_size_helper<int>(team_size * maxneigh);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeNeigh> policy_neigh(chunk_size,team_size,vector_length);
+      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+    }
+
+    //ComputeRadial
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagPairPACEComputeRadial>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeRadial> policy_radial(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      Kokkos::parallel_for("ComputeRadial",policy_radial,*this);
+    }
+
+    //ComputeYlm
+    {
+      int vector_length = vector_length_default;
+      int team_size = 16;
+      check_team_size_for<TagPairPACEComputeYlm>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm> policy_ylm(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      Kokkos::parallel_for("ComputeYlm",policy_ylm,*this);
+    }
+
+    //ComputeAi
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagPairPACEComputeAi>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi> policy_ai(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      Kokkos::parallel_for("ComputeAi",policy_ai,*this);
+    }
+
+    //ConjugateAi
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEConjugateAi> policy_conj_ai(0,chunk_size);
+      Kokkos::parallel_for("ConjugateAi",policy_conj_ai,*this);
+    }
+
+    //ComputeRho
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeRho> policy_rho(0,chunk_size*idx_rho_max);
+      Kokkos::parallel_for("ComputeRho",policy_rho,*this);
+    }
+
+    //ComputeFS
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeFS> policy_fs(0,chunk_size);
+      Kokkos::parallel_for("ComputeFS",policy_fs,*this);
+    }
+
+    //ComputeWeights
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeWeights> policy_weights(0,chunk_size*idx_rho_max);
+      Kokkos::parallel_for("ComputeWeights",policy_weights,*this);
+    }
+
+    //ComputeDerivative
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagPairPACEComputeDerivative>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeDerivative> policy_derivative(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      Kokkos::parallel_for("ComputeDerivative",policy_derivative,*this);
+    }
+
+    //ComputeForce
+    {
+      if (evflag) {
+        if (neighflag == HALF) {
+          typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALF,1> > policy_force(0,chunk_size);
+          Kokkos::parallel_reduce(policy_force, *this, ev_tmp);
+        } else if (neighflag == HALFTHREAD) {
+          typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALFTHREAD,1> > policy_force(0,chunk_size);
+          Kokkos::parallel_reduce("ComputeForce",policy_force, *this, ev_tmp);
+        }
+      } else {
+        if (neighflag == HALF) {
+          typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALF,0> > policy_force(0,chunk_size);
+          Kokkos::parallel_for(policy_force, *this);
+        } else if (neighflag == HALFTHREAD) {
+          typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALFTHREAD,0> > policy_force(0,chunk_size);
+          Kokkos::parallel_for("ComputeForce",policy_force, *this);
+        }
+      }
+    }
+    ev += ev_tmp;
+    chunk_offset += chunk_size;
+
+  } // end while
+
+  if (need_dup)
+    Kokkos::Experimental::contribute(f, dup_f);
+
+  if (eflag_global) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    if (need_dup)
+      Kokkos::Experimental::contribute(d_vatom, dup_vatom);
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+
+  atomKK->modified(execution_space,F_MASK);
+
+  copymode = 0;
+
+  // free duplicated memory
+  if (need_dup) {
+    dup_f     = decltype(dup_f)();
+    dup_vatom = decltype(dup_vatom)();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeNeigh>::member_type& team) const
+{
+  const int ii = team.league_rank();
+  const int i = d_ilist[ii + chunk_offset];
+  const int itype = type[i];
+  const X_FLOAT xtmp = x(i,0);
+  const X_FLOAT ytmp = x(i,1);
+  const X_FLOAT ztmp = x(i,2);
+  const int jnum = d_numneigh[i];
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff
+  // If it is, inside is assigned to 1, otherwise -1
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * maxneigh; // offset into pointer for entire team
+  int* inside = (int*)team.team_shmem().get_shmem(team.team_size() * maxneigh * sizeof(int), 0) + scratch_shift;
+
+  // loop over list of all neighbors within force cutoff
+  // distsq[] = distance sq to each
+  // rlist[] = distance vector to each
+  // nearest[] = atom indices of neighbors
+
+  int ncount = 0;
+  Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,jnum),
+      [&] (const int jj, int& count) {
+    int j = d_neighbors(i,jj);
+    j &= NEIGHMASK;
+
+    const int jtype = type(j);
+
+    const F_FLOAT delx = xtmp - x(j,0);
+    const F_FLOAT dely = ytmp - x(j,1);
+    const F_FLOAT delz = ztmp - x(j,2);
+    const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+    inside[jj] = -1;
+    if (rsq < d_cutsq(itype,jtype)) {
+     inside[jj] = 1;
+     count++;
+    }
+  },ncount);
+
+  d_ncount(ii) = ncount;
+
+  Kokkos::parallel_scan(Kokkos::TeamThreadRange(team,jnum),
+      [&] (const int jj, int& offset, bool final) {
+
+    if (inside[jj] < 0) return;
+
+    if (final) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+      const F_FLOAT delx = xtmp - x(j,0);
+      const F_FLOAT dely = ytmp - x(j,1);
+      const F_FLOAT delz = ztmp - x(j,2);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+      const F_FLOAT r = sqrt(rsq);
+      const F_FLOAT rinv = 1.0/r;
+      const int mu_j = d_map(type(j));
+      d_mu(ii,offset) = mu_j;
+      d_rnorms(ii,offset) = r;
+      d_rhats(ii,offset,0) = -delx*rinv;
+      d_rhats(ii,offset,1) = -dely*rinv;
+      d_rhats(ii,offset,2) = -delz*rinv;
+      d_nearest(ii,offset) = j;
+    }
+    offset++;
+  });
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeRadial, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeRadial>::member_type& team) const
+{
+  // Extract the atom number
+  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
+           ((chunk_size+team.team_size()-1)/team.team_size()));
+  if (ii >= chunk_size) return;
+  const int i = d_ilist[ii + chunk_offset];
+
+  // Extract the neighbor number
+  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
+  const int ncount = d_ncount(ii);
+  if (jj >= ncount) return;
+
+  const double r_norm = d_rnorms(ii, jj);
+  const int mu_i = d_map(type(i));
+  const int mu_j = d_mu(ii, jj);
+
+  evaluate_splines(ii, jj, r_norm, nradbase, nradmax, mu_i, mu_j);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeYlm, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm>::member_type& team) const
+{
+  // Extract the atom number
+  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
+           ((chunk_size+team.team_size()-1)/team.team_size()));
+  if (ii >= chunk_size) return;
+
+  // Extract the neighbor number
+  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
+  const int ncount = d_ncount(ii);
+  if (jj >= ncount) return;
+
+  const double xn = d_rhats(ii, jj, 0);
+  const double yn = d_rhats(ii, jj, 1);
+  const double zn = d_rhats(ii, jj, 2);
+  compute_ylm(ii,jj,xn,yn,zn,lmax);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeAi, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi>::member_type& team) const
+{
+  // Extract the atom number
+  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
+           ((chunk_size+team.team_size()-1)/team.team_size()));
+  if (ii >= chunk_size) return;
+
+  // Extract the neighbor number
+  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
+  const int ncount = d_ncount(ii);
+  if (jj >= ncount) return;
+
+  const int mu_j = d_mu(ii, jj);
+
+  // rank = 1
+  for (int n = 0; n < nradbase; n++)
+    Kokkos::atomic_add(&A_rank1(ii, mu_j, n), gr(ii, jj, n) * Y00);
+
+  // rank > 1
+  for (int n = 0; n < nradmax; n++) {
+    for (int l = 0; l <= lmax; l++) {
+      for (int m = 0; m <= l; m++) {
+        const int idx = l * (l + 1) + m; // (l, m)
+        Kokkos::atomic_add(&A(ii, mu_j, n, idx).re, fr(ii, jj, n, l) * ylm(ii, jj, idx).re);
+        Kokkos::atomic_add(&A(ii, mu_j, n, idx).im, fr(ii, jj, n, l) * ylm(ii, jj, idx).im);
+      }
+    }
+  }
+
+  // hard-core repulsion
+  Kokkos::atomic_add(&rho_core(ii), cr(ii, jj));
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEConjugateAi, const int& ii) const
+{
+  //complex conjugate A's (for NEGATIVE (-m) terms)
+  // for rank > 1
+  for (int mu_j = 0; mu_j < nelements; mu_j++) {
+    for (int n = 0; n < nradmax; n++) {
+      for (int l = 0; l <= lmax; l++) {
+        //fill in -m part in the outer loop using the same m <-> -m symmetry as for Ylm
+        for (int m = 1; m <= l; m++) {
+          const int idx = l * (l + 1) + m; // (l, m)
+          const int idxm = l * (l + 1) - m; // (l, -m)
+          const int factor = m % 2 == 0 ? 1 : -1;
+          A(ii, mu_j, n, idxm) = A(ii, mu_j, n, idx).conj() * (double)factor;
+        }
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeRho, const int& iter) const
+{
+  const int idx_rho = iter / chunk_size;
+  const int ii = iter % chunk_size;
+
+  const int i = d_ilist[ii + chunk_offset];
+  const int mu_i = d_map(type(i));
+
+  if (idx_rho >= d_idx_rho_count(mu_i)) return;
+
+  const int ndensity = d_ndensity(mu_i);
+
+  const int offset = d_offsets(mu_i, idx_rho);
+  const int rank = d_rank(mu_i, offset);
+  const int r = rank - 1;
+
+  // Basis functions B with iterative product and density rho(p) calculation
+  if (rank == 1) {
+    const int mu = d_mus(mu_i, offset, 0);
+    const int n = d_ns(mu_i, offset, 0);
+    double A_cur = A_rank1(ii, mu, n - 1);
+    for (int p = 0; p < ndensity; ++p) {
+      //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
+      Kokkos::atomic_add(&rhos(ii, p), d_ctildes(mu_i, idx_rho, p) * A_cur);
+    }
+  } else { // rank > 1
+    // loop over {ms} combinations in sum
+
+    // loop over m, collect B  = product of A with given ms
+    A_forward_prod(ii, idx_rho, 0) = complex::one();
+
+    // fill forward A-product triangle
+    for (int t = 0; t < rank; t++) {
+      //TODO: optimize ns[t]-1 -> ns[t] during functions construction
+      const int mu = d_mus(mu_i, offset, t);
+      const int n = d_ns(mu_i, offset, t);
+      const int l = d_ls(mu_i, offset, t);
+      const int m = d_ms_combs(mu_i, idx_rho, t); // current ms-combination (of length = rank)
+      const int idx = l * (l + 1) + m; // (l, m)
+      A_list(ii, idx_rho, t) = A(ii, mu, n - 1, idx);
+      A_forward_prod(ii, idx_rho, t + 1) = A_forward_prod(ii, idx_rho, t) * A_list(ii, idx_rho, t);
+    }
+
+    complex A_backward_prod = complex::one();
+
+    // fill backward A-product triangle
+    for (int t = r; t >= 1; t--) {
+      const complex dB = A_forward_prod(ii, idx_rho, t) * A_backward_prod; // dB - product of all A's except t-th
+      dB_flatten(ii, idx_rho, t) = dB;
+
+      A_backward_prod = A_backward_prod * A_list(ii, idx_rho, t);
+    }
+    dB_flatten(ii, idx_rho, 0) = A_forward_prod(ii, idx_rho, 0) * A_backward_prod;
+
+    const complex B = A_forward_prod(ii, idx_rho, rank);
+
+    for (int p = 0; p < ndensity; ++p) {
+      // real-part only multiplication
+      Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_ctildes(mu_i, idx_rho, p)));
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeFS, const int& ii) const
+{
+  const int i = d_ilist[ii + chunk_offset];
+  const int mu_i = d_map(type(i));
+
+  const double rho_cut = d_rho_core_cutoff(mu_i);
+  const double drho_cut = d_drho_core_cutoff(mu_i);
+  const int ndensity = d_ndensity(mu_i);
+
+  double evdwl, fcut, dfcut;
+  evdwl = fcut = dfcut = 0.0;
+
+  inner_cutoff(rho_core(ii), rho_cut, drho_cut, fcut, dfcut);
+  FS_values_and_derivatives(ii, evdwl, mu_i);
+
+  dF_drho_core(ii) = evdwl * dfcut + 1;
+  for (int p = 0; p < ndensity; ++p)
+    dF_drho(ii, p) *= fcut;
+
+
+  // tally energy contribution
+  if (eflag) {
+    double evdwl_cut = evdwl * fcut + rho_core(ii);
+    // E0 shift
+    evdwl_cut += d_E0vals(mu_i);
+    e_atom(ii) = evdwl_cut;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeWeights, const int& iter) const
+{
+  const int idx_rho = iter / chunk_size;
+  const int ii = iter % chunk_size;
+
+  const int i = d_ilist[ii + chunk_offset];
+  const int mu_i = d_map(type(i));
+
+  if (idx_rho >= d_idx_rho_count(mu_i)) return;
+
+  const int ndensity = d_ndensity(mu_i);
+
+  const int offset = d_offsets(mu_i, idx_rho);
+  const int rank = d_rank(mu_i, offset);
+
+  // Weights and theta calculation
+
+  if (rank == 1) {
+    const int mu = d_mus(mu_i, offset, 0);
+    const int n = d_ns(mu_i, offset, 0);
+    double theta = 0.0;
+    for (int p = 0; p < ndensity; ++p) {
+      // for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
+      theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p);
+    }
+    Kokkos::atomic_add(&weights_rank1(ii, mu, n - 1), theta);
+  } else { // rank > 1
+    double theta = 0.0;
+    for (int p = 0; p < ndensity; ++p)
+      theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p);
+
+    theta *= 0.5; // 0.5 factor due to possible double counting ???
+    for (int t = 0; t < rank; ++t) {
+      const int m_t = d_ms_combs(mu_i, idx_rho, t);
+      const int factor = (m_t % 2 == 0 ? 1 : -1);
+      const complex dB = dB_flatten(ii, idx_rho, t);
+      const int mu_t = d_mus(mu_i, offset, t);
+      const int n_t = d_ns(mu_i, offset, t);
+      const int l_t = d_ls(mu_i, offset, t);
+      const int idx = l_t * (l_t + 1) + m_t; // (l, m)
+      const complex value = theta * dB;
+      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).re), value.re);
+      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).im), value.im);
+      // update -m_t (that could also be positive), because the basis is half_basis
+      const int idxm = l_t * (l_t + 1) - m_t; // (l, -m)
+      const complex valuem = theta * dB.conj() * (double)factor;
+      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).re), valuem.re);
+      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).im), valuem.im);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeDerivative, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeDerivative>::member_type& team) const
+{
+  // Extract the atom number
+  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
+           ((chunk_size+team.team_size()-1)/team.team_size()));
+  if (ii >= chunk_size) return;
+  const int i = d_ilist[ii + chunk_offset];
+
+  // Extract the neighbor number
+  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
+  const int ncount = d_ncount(ii);
+  if (jj >= ncount) return;
+
+  const int itype = type(i);
+  const double scale = d_scale(itype,itype);
+
+  const int mu_j = d_mu(ii, jj);
+  double r_hat[3];
+  r_hat[0] = d_rhats(ii, jj, 0);
+  r_hat[1] = d_rhats(ii, jj, 1);
+  r_hat[2] = d_rhats(ii, jj, 2);
+  const double r = d_rnorms(ii, jj);
+  const double rinv = 1.0/r;
+
+  double f_ji[3];
+  f_ji[0] = f_ji[1] = f_ji[2] = 0;
+
+  // for rank = 1
+  for (int n = 0; n < nradbase; ++n) {
+    if (weights_rank1(ii, mu_j, n) == 0) continue;
+    double &DG = dgr(ii, jj, n);
+    double DGR = DG * Y00;
+    DGR *= weights_rank1(ii, mu_j, n);
+    f_ji[0] += DGR * r_hat[0];
+    f_ji[1] += DGR * r_hat[1];
+    f_ji[2] += DGR * r_hat[2];
+  }
+
+  // for rank > 1
+  for (int n = 0; n < nradmax; n++) {
+    for (int l = 0; l <= lmax; l++) {
+      const double R_over_r = fr(ii, jj, n, l) * rinv;
+      const double DR = dfr(ii, jj, n, l);
+
+      // for m >= 0
+      for (int m = 0; m <= l; m++) {
+        const int idx = l * (l + 1) + m; // (l, m)
+        complex w = weights(ii, mu_j, n, idx);
+        if (w.re == 0.0 && w.im == 0.0) continue;
+        // counting for -m cases if m > 0
+        if (m > 0) {
+          w.re *= 2.0;
+          w.im *= 2.0;
+        }
+
+        complex DY[3];
+        DY[0] = dylm(ii, jj, idx, 0);
+        DY[1] = dylm(ii, jj, idx, 1);
+        DY[2] = dylm(ii, jj, idx, 2);
+        const complex Y_DR = ylm(ii, jj, idx) * DR;
+
+        complex grad_phi_nlm[3];
+        grad_phi_nlm[0] = Y_DR * r_hat[0] + DY[0] * R_over_r;
+        grad_phi_nlm[1] = Y_DR * r_hat[1] + DY[1] * R_over_r;
+        grad_phi_nlm[2] = Y_DR * r_hat[2] + DY[2] * R_over_r;
+        // real-part multiplication only
+        f_ji[0] += w.real_part_product(grad_phi_nlm[0]);
+        f_ji[1] += w.real_part_product(grad_phi_nlm[1]);
+        f_ji[2] += w.real_part_product(grad_phi_nlm[2]);
+      }
+    }
+  }
+
+  // hard-core repulsion
+  const double fpair = dF_drho_core(ii) * dcr(ii,jj);
+  f_ij(ii, jj, 0) = scale * f_ji[0] + fpair * r_hat[0];
+  f_ij(ii, jj, 1) = scale * f_ji[1] + fpair * r_hat[1];
+  f_ij(ii, jj, 2) = scale * f_ji[2] + fpair * r_hat[2];
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int NEIGHFLAG, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>, const int& ii, EV_FLOAT& ev) const
+{
+  // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
+  const auto v_f = ScatterViewHelper<typename NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
+  const auto a_f = v_f.template access<typename AtomicDup<NEIGHFLAG,DeviceType>::value>();
+
+  const int i = d_ilist[ii + chunk_offset];
+  const int itype = type(i);
+  const double scale = d_scale(itype,itype);
+
+  const int ncount = d_ncount(ii);
+
+  F_FLOAT fitmp[3] = {0.0,0.0,0.0};
+  for (int jj = 0; jj < ncount; jj++) {
+    int j = d_nearest(ii,jj);
+
+    double r_hat[3];
+    r_hat[0] = d_rhats(ii, jj, 0);
+    r_hat[1] = d_rhats(ii, jj, 1);
+    r_hat[2] = d_rhats(ii, jj, 2);
+    const double r = d_rnorms(ii, jj);
+    const double delx = -r_hat[0]*r;
+    const double dely = -r_hat[1]*r;
+    const double delz = -r_hat[2]*r;
+
+    const double fpairx = f_ij(ii, jj, 0);
+    const double fpairy = f_ij(ii, jj, 1);
+    const double fpairz = f_ij(ii, jj, 2);
+
+    fitmp[0] += fpairx;
+    fitmp[1] += fpairy;
+    fitmp[2] += fpairz;
+    a_f(j,0) -= fpairx;
+    a_f(j,1) -= fpairy;
+    a_f(j,2) -= fpairz;
+
+    // tally per-atom virial contribution
+    if (EVFLAG && vflag_either)
+      v_tally_xyz<NEIGHFLAG>(ev, i, j, fpairx, fpairy, fpairz, delx, dely, delz);
+  }
+
+  a_f(i,0) += fitmp[0];
+  a_f(i,1) += fitmp[1];
+  a_f(i,2) += fitmp[2];
+
+  // tally energy contribution
+  if (EVFLAG && eflag_either) {
+    const double evdwl = scale*e_atom(ii);
+    //ev_tally_full(i, 2.0 * evdwl, 0.0, 0.0, 0.0, 0.0, 0.0);
+    if (eflag_global) ev.evdwl += evdwl;
+    if (eflag_atom) d_eatom[i] += evdwl;
+  }
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const {
+  EV_FLOAT ev;
+  this->template operator()<NEIGHFLAG,EVFLAG>(TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>(), ii, ev);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int NEIGHFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz,
+      const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const
+{
+  // The vatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
+
+  auto v_vatom = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom);
+  auto a_vatom = v_vatom.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+
+  const E_FLOAT v0 = delx*fx;
+  const E_FLOAT v1 = dely*fy;
+  const E_FLOAT v2 = delz*fz;
+  const E_FLOAT v3 = delx*fy;
+  const E_FLOAT v4 = delx*fz;
+  const E_FLOAT v5 = dely*fz;
+
+  if (vflag_global) {
+    ev.v[0] += v0;
+    ev.v[1] += v1;
+    ev.v[2] += v2;
+    ev.v[3] += v3;
+    ev.v[4] += v4;
+    ev.v[5] += v5;
+  }
+
+  if (vflag_atom) {
+    a_vatom(i,0) += 0.5*v0;
+    a_vatom(i,1) += 0.5*v1;
+    a_vatom(i,2) += 0.5*v2;
+    a_vatom(i,3) += 0.5*v3;
+    a_vatom(i,4) += 0.5*v4;
+    a_vatom(i,5) += 0.5*v5;
+    a_vatom(j,0) += 0.5*v0;
+    a_vatom(j,1) += 0.5*v1;
+    a_vatom(j,2) += 0.5*v2;
+    a_vatom(j,3) += 0.5*v3;
+    a_vatom(j,4) += 0.5*v4;
+    a_vatom(j,5) += 0.5*v5;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::pre_compute_harmonics(int lmax)
+{
+  auto h_alm = Kokkos::create_mirror_view(alm);
+  auto h_blm = Kokkos::create_mirror_view(blm);
+  auto h_cl = Kokkos::create_mirror_view(cl);
+  auto h_dl = Kokkos::create_mirror_view(dl);
+
+  for (int l = 1; l <= lmax; l++) {
+    const double lsq = l * l;
+    const double ld = 2 * l;
+    const double l1 = (4 * lsq - 1);
+    const double l2 = lsq - ld + 1;
+    for (int m = 0; m < l - 1; m++) {
+      const double msq = m * m;
+      const double a = sqrt((double(l1)) / (double(lsq - msq)));
+      const double b = -sqrt((double(l2 - msq)) / (double(4 * l2 - 1)));
+      const int idx = l * (l + 1) + m; // (l, m)
+      h_alm(idx) = a;
+      h_blm(idx) = b;
+    }
+  }
+
+  for (int l = 1; l <= lmax; l++) {
+    h_cl(l) = -sqrt(1.0 + 0.5 / (double(l)));
+    h_dl(l) = sqrt(double(2 * (l - 1) + 3));
+  }
+
+  Kokkos::deep_copy(alm, h_alm);
+  Kokkos::deep_copy(blm, h_blm);
+  Kokkos::deep_copy(cl, h_cl);
+  Kokkos::deep_copy(dl, h_dl);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::compute_barplm(int ii, int jj, double rz, int lmax) const
+{
+  // requires -1 <= rz <= 1 , NO CHECKING IS PERFORMED !!!!!!!!!
+  // prefactors include 1/sqrt(2) factor compared to reference
+
+  // l=0, m=0
+  // plm(ii, jj, 0, 0) = Y00/sq1o4pi; //= sq1o4pi;
+  plm(ii, jj, 0) = Y00; //= 1;
+  dplm(ii, jj, 0) = 0.0;
+
+  if (lmax > 0) {
+
+    // l=1, m=0
+    plm(ii, jj, 2) = Y00 * sq3 * rz;
+    dplm(ii, jj, 2) = Y00 * sq3;
+
+    // l=1, m=1
+    plm(ii, jj, 3) = -sq3o2 * Y00;
+    dplm(ii, jj, 3) = 0.0;
+
+    // loop l = 2, lmax
+    for (int l = 2; l <= lmax; l++) {
+      for (int m = 0; m < l - 1; m++) {
+        const int idx = l * (l + 1) + m; // (l, m)
+        const int idx1 = (l - 1) * l + m; // (l - 1, m)
+        const int idx2 = (l - 2) * (l - 1) + m; // (l - 2, m)
+        plm(ii, jj, idx) = alm(idx) * (rz * plm(ii, jj, idx1) + blm(idx) * plm(ii, jj, idx2));
+        dplm(ii, jj, idx) = alm(idx) * (plm(ii, jj, idx1) + rz * dplm(ii, jj, idx1) + blm(idx) * dplm(ii, jj, idx2));
+      }
+      const int idx = l * (l + 1) + l; // (l, l)
+      const int idx1 = l * (l + 1) + l - 1; // (l, l - 1)
+      const int idx2 = (l - 1) * l + l - 1; // (l - 1, l - 1)
+      const double t = dl(l) * plm(ii, jj, idx2);
+      plm(ii, jj, idx1) = t * rz;
+      dplm(ii, jj, idx1) = t;
+      plm(ii, jj, idx) = cl(l) * plm(ii, jj, idx2);
+      dplm(ii, jj, idx) = 0.0;
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::compute_ylm(int ii, int jj, double rx, double ry, double rz, int lmax) const
+{
+  // requires rx^2 + ry^2 + rz^2 = 1 , NO CHECKING IS PERFORMED !!!!!!!!!
+
+  complex phase;
+  complex phasem, mphasem1;
+  complex dyx, dyy, dyz;
+  complex rdy;
+
+  phase.re = rx;
+  phase.im = ry;
+
+  // compute barplm
+  compute_barplm(ii, jj, rz, lmax);
+
+  // m = 0
+  for (int l = 0; l <= lmax; l++) {
+    const int idx = l * (l + 1);
+
+    ylm(ii, jj, idx).re = plm(ii, jj, idx);
+    ylm(ii, jj, idx).im = 0.0;
+
+    dyz.re = dplm(ii, jj, idx);
+    rdy.re = dyz.re * rz;
+
+    dylm(ii, jj, idx, 0).re = -rdy.re * rx;
+    dylm(ii, jj, idx, 0).im = 0.0;
+    dylm(ii, jj, idx, 1).re = -rdy.re * ry;
+    dylm(ii, jj, idx, 1).im = 0.0;
+    dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
+    dylm(ii, jj, idx, 2).im = 0;
+  }
+  // m = 1
+  for (int l = 1; l <= lmax; l++) {
+    const int idx = l * (l + 1) + 1;
+
+    ylm(ii, jj, idx) = phase * plm(ii, jj, idx);
+
+    dyx.re = plm(ii, jj, idx);
+    dyx.im = 0.0;
+    dyy.re = 0.0;
+    dyy.im = plm(ii, jj, idx);
+    dyz.re = phase.re * dplm(ii, jj, idx);
+    dyz.im = phase.im * dplm(ii, jj, idx);
+
+    rdy.re = rx * dyx.re + +rz * dyz.re;
+    rdy.im = ry * dyy.im + rz * dyz.im;
+
+    dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx;
+    dylm(ii, jj, idx, 0).im = -rdy.im * rx;
+    dylm(ii, jj, idx, 1).re = -rdy.re * ry;
+    dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry;
+    dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
+    dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz;
+  }
+
+  // m > 1
+  phasem = phase;
+  for (int m = 2; m <= lmax; m++) {
+
+    mphasem1.re = phasem.re * double(m);
+    mphasem1.im = phasem.im * double(m);
+    phasem = phasem * phase;
+
+    for (int l = m; l <= lmax; l++) {
+      const int idx = l * (l + 1) + m;
+
+      ylm(ii, jj, idx).re = phasem.re * plm(ii, jj, idx);
+      ylm(ii, jj, idx).im = phasem.im * plm(ii, jj, idx);
+
+      dyx = mphasem1 * plm(ii, jj, idx);
+      dyy.re = -dyx.im;
+      dyy.im = dyx.re;
+      dyz = phasem * dplm(ii, jj, idx);
+
+      rdy.re = rx * dyx.re + ry * dyy.re + rz * dyz.re;
+      rdy.im = rx * dyx.im + ry * dyy.im + rz * dyz.im;
+
+      dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx;
+      dylm(ii, jj, idx, 0).im = dyx.im - rdy.im * rx;
+      dylm(ii, jj, idx, 1).re = dyy.re - rdy.re * ry;
+      dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry;
+      dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
+      dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz;
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::cutoff_func_poly(const double r, const double r_in, const double delta_in, double &fc, double &dfc) const
+{
+  if (r <= r_in-delta_in) {
+    fc = 1;
+    dfc = 0;
+  } else if (r >= r_in ) {
+    fc = 0;
+    dfc = 0;
+  } else {
+    double x = 1 - 2 * (1 + (r - r_in) / delta_in);
+    fc = 0.5 + 7.5 / 2. * (x / 4. - pow(x, 3) / 6. + pow(x, 5) / 20.);
+    dfc = -7.5 / delta_in * (0.25 - x * x / 2.0 + pow(x, 4) / 4.);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::Fexp(const double x, const double m, double &F, double &DF) const
+{
+  const double w = 1.e6;
+  const double eps = 1e-10;
+
+  const double lambda = pow(1.0 / w, m - 1.0);
+  if (abs(x) > eps) {
+    double g;
+    const double a = abs(x);
+    const double am = pow(a, m);
+    const double w3x3 = pow(w * a, 3); //// use cube
+    const double sign_factor = (signbit(x) ? -1 : 1);
+    if (w3x3 > 30.0)
+        g = 0.0;
+    else
+        g = exp(-w3x3);
+
+    const double omg = 1.0 - g;
+    F = sign_factor * (omg * am + lambda * g * a);
+    const double dg = -3.0 * w * w * w * a * a * g;
+    DF = m * pow(a, m - 1.0) * omg - am * dg + lambda * dg * a + lambda * g;
+  } else {
+    F = lambda * x;
+    DF = lambda;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::FexpShiftedScaled(const double rho, const double mexp, double &F, double &DF) const
+{
+  const double eps = 1e-10;
+
+  if (abs(mexp - 1.0) < eps) {
+    F = rho;
+    DF = 1;
+  } else {
+    const double a = abs(rho);
+    const double exprho = exp(-a);
+    const double nx = 1. / mexp;
+    const double xoff = pow(nx, (nx / (1.0 - nx))) * exprho;
+    const double yoff = pow(nx, (1 / (1.0 - nx))) * exprho;
+    const double sign_factor = (signbit(rho) ? -1 : 1);
+    F = sign_factor * (pow(xoff + a, mexp) - yoff);
+    DF = yoff + mexp * (-xoff + 1.0) * pow(xoff + a, mexp - 1.);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::inner_cutoff(const double rho_core, const double rho_cut, const double drho_cut,
+                                     double &fcut, double &dfcut) const
+{
+  double rho_low = rho_cut - drho_cut;
+  if (rho_core >= rho_cut) {
+    fcut = 0;
+    dfcut = 0;
+  } else if (rho_core <= rho_low) {
+    fcut = 1;
+    dfcut = 0;
+  } else {
+    cutoff_func_poly(rho_core, rho_cut, drho_cut, fcut, dfcut);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::FS_values_and_derivatives(const int ii, double &evdwl, const int mu_i) const
+{
+  double F, DF = 0;
+  int npoti = d_npoti(mu_i);
+  int ndensity = d_ndensity(mu_i);
+  for (int p = 0; p < ndensity; p++) {
+    const double wpre = d_wpre(mu_i, p);
+    const double mexp = d_mexp(mu_i, p);
+
+    if (npoti == FS)
+      Fexp(rhos(ii, p), mexp, F, DF);
+    else if (npoti == FS_SHIFTEDSCALED)
+      FexpShiftedScaled(rhos(ii, p), mexp, F, DF);
+
+    evdwl += F * wpre; // * weight (wpre)
+    dF_drho(ii, p) = DF * wpre; // * weight (wpre)
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::evaluate_splines(const int ii, const int jj, double r,
+                                                  int /*nradbase_c*/, int /*nradial_c*/,
+                                                  int mu_i, int mu_j) const
+{
+  auto &spline_gk = k_splines_gk.template view<DeviceType>()(mu_i, mu_j);
+  auto &spline_rnl = k_splines_rnl.template view<DeviceType>()(mu_i, mu_j);
+  auto &spline_hc = k_splines_hc.template view<DeviceType>()(mu_i, mu_j);
+
+  spline_gk.calcSplines(ii, jj, r, gr, dgr);
+
+  spline_rnl.calcSplines(ii, jj, r, d_values, d_derivatives);
+  for (int kk = 0; kk < (int)fr.extent(2); kk++) {
+    for (int ll = 0; ll < (int)fr.extent(3); ll++) {
+      const int flatten = kk*fr.extent(3) + ll;
+      fr(ii, jj, kk, ll) = d_values(ii, jj, flatten);
+      dfr(ii, jj, kk, ll) = d_derivatives(ii, jj, flatten);
+    }
+  }
+
+  spline_hc.calcSplines(ii, jj, r, d_values, d_derivatives);
+  cr(ii, jj) = d_values(ii, jj, 0);
+  dcr(ii, jj) = d_derivatives(ii, jj, 0);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::SplineInterpolatorKokkos::calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const
+{
+  double wl, wl2, wl3, w2l1, w3l2;
+  double c[4];
+  double x = r * rscalelookup;
+  int nl = static_cast<int>(floor(x));
+
+  if (nl <= 0)
+    Kokkos::abort("Encountered very small distance. Stopping.");
+
+  if (nl < nlut) {
+    wl = x - double(nl);
+    wl2 = wl * wl;
+    wl3 = wl2 * wl;
+    w2l1 = 2.0 * wl;
+    w3l2 = 3.0 * wl2;
+    for (int func_id = 0; func_id < num_of_functions; func_id++) {
+      for (int idx = 0; idx < 4; idx++)
+        c[idx] = lookupTable(nl, func_id, idx);
+      d_values(ii, jj, func_id) = c[0] + c[1] * wl + c[2] * wl2 + c[3] * wl3;
+      d_derivatives(ii, jj, func_id) = (c[1] + c[2] * w2l1 + c[3] * w3l2) * rscalelookup;
+    }
+  } else { // fill with zeroes
+    for (int func_id = 0; func_id < num_of_functions; func_id++) {
+      d_values(ii, jj, func_id) = 0.0;
+      d_derivatives(ii, jj, func_id) = 0.0;
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<class TagStyle>
+void PairPACEExtrapolationKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, int vector_length) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<class TagStyle>
+void PairPACEExtrapolationKokkos<DeviceType>::check_team_size_reduce(int inum, int &team_size, int vector_length) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType>
+template<typename scratch_type>
+int PairPACEExtrapolationKokkos<DeviceType>::scratch_size_helper(int values_per_team) {
+  typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
+
+  return ScratchViewType::shmem_size(values_per_team);
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairPACEExtrapolationKokkos<DeviceType>::memory_usage()
+{
+  double bytes = 0;
+
+  bytes += MemKK::memory_usage(A);
+  bytes += MemKK::memory_usage(A_rank1);
+  bytes += MemKK::memory_usage(A_list);
+  bytes += MemKK::memory_usage(A_forward_prod);
+  bytes += MemKK::memory_usage(e_atom);
+  bytes += MemKK::memory_usage(rhos);
+  bytes += MemKK::memory_usage(dF_drho);
+  bytes += MemKK::memory_usage(weights);
+  bytes += MemKK::memory_usage(weights_rank1);
+  bytes += MemKK::memory_usage(rho_core);
+  bytes += MemKK::memory_usage(dF_drho_core);
+  bytes += MemKK::memory_usage(dB_flatten);
+  bytes += MemKK::memory_usage(fr);
+  bytes += MemKK::memory_usage(dfr);
+  bytes += MemKK::memory_usage(gr);
+  bytes += MemKK::memory_usage(dgr);
+  bytes += MemKK::memory_usage(d_values);
+  bytes += MemKK::memory_usage(d_derivatives);
+  bytes += MemKK::memory_usage(cr);
+  bytes += MemKK::memory_usage(dcr);
+  bytes += MemKK::memory_usage(plm);
+  bytes += MemKK::memory_usage(dplm);
+  bytes += MemKK::memory_usage(ylm);
+  bytes += MemKK::memory_usage(dylm);
+  bytes += MemKK::memory_usage(d_ncount);
+  bytes += MemKK::memory_usage(d_mu);
+  bytes += MemKK::memory_usage(d_rhats);
+  bytes += MemKK::memory_usage(d_rnorms);
+  bytes += MemKK::memory_usage(d_nearest);
+  bytes += MemKK::memory_usage(f_ij);
+  bytes += MemKK::memory_usage(d_rho_core_cutoff);
+  bytes += MemKK::memory_usage(d_drho_core_cutoff);
+  bytes += MemKK::memory_usage(d_E0vals);
+  bytes += MemKK::memory_usage(d_ndensity);
+  bytes += MemKK::memory_usage(d_npoti);
+  bytes += MemKK::memory_usage(d_wpre);
+  bytes += MemKK::memory_usage(d_mexp);
+  bytes += MemKK::memory_usage(d_idx_rho_count);
+  bytes += MemKK::memory_usage(d_rank);
+  bytes += MemKK::memory_usage(d_num_ms_combs);
+  bytes += MemKK::memory_usage(d_offsets);
+  bytes += MemKK::memory_usage(d_mus);
+  bytes += MemKK::memory_usage(d_ns);
+  bytes += MemKK::memory_usage(d_ls);
+  bytes += MemKK::memory_usage(d_ms_combs);
+  bytes += MemKK::memory_usage(d_ctildes);
+  bytes += MemKK::memory_usage(alm);
+  bytes += MemKK::memory_usage(blm);
+  bytes += MemKK::memory_usage(cl);
+  bytes += MemKK::memory_usage(dl);
+
+  if (k_splines_gk.h_view.data()) {
+    for (int i = 0; i < nelements; i++) {
+      for (int j = 0; j < nelements; j++) {
+        bytes += k_splines_gk.h_view(i, j).memory_usage();
+        bytes += k_splines_rnl.h_view(i, j).memory_usage();
+        bytes += k_splines_hc.h_view(i, j).memory_usage();
+      }
+    }
+  }
+
+  return bytes;
+}
+
+/* ---------------------------------------------------------------------- */
+
+namespace LAMMPS_NS {
+template class PairPACEExtrapolationKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class PairPACEExtrapolationKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
new file mode 100644
index 0000000000..482d068725
--- /dev/null
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
@@ -0,0 +1,334 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(pace/extrapolation/kk,PairPACEExtrapolationKokkos<LMPDeviceType>);
+PairStyle(pace/extrapolation/kk/device,PairPACEExtrapolationKokkos<LMPDeviceType>);
+PairStyle(pace/extrapolation/kk/host,PairPACEExtrapolationKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H
+#define LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H
+
+#include "pair_pace.h"
+#include "ace-evaluator/ace_radial.h"
+#include "kokkos_type.h"
+#include "pair_kokkos.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class PairPACEExtrapolationKokkos : public PairPACE {
+ public:
+  struct TagPairPACEComputeNeigh{};
+  struct TagPairPACEComputeRadial{};
+  struct TagPairPACEComputeYlm{};
+  struct TagPairPACEComputeAi{};
+  struct TagPairPACEConjugateAi{};
+  struct TagPairPACEComputeRho{};
+  struct TagPairPACEComputeFS{};
+  struct TagPairPACEComputeWeights{};
+  struct TagPairPACEComputeDerivative{};
+
+  template<int NEIGHFLAG, int EVFLAG>
+  struct TagPairPACEComputeForce{};
+
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;
+  using complex = SNAComplex<double>;
+
+  PairPACEExtrapolationKokkos(class LAMMPS *);
+  ~PairPACEExtrapolationKokkos() override;
+
+  void compute(int, int) override;
+  void coeff(int, char **) override;
+  void init_style() override;
+  double init_one(int, int) override;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeNeigh>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeRadial,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeRadial>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeYlm,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeAi,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEConjugateAi,const int& ii) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeRho,const int& iter) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeFS,const int& ii) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeWeights,const int& iter) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeDerivative,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeDerivative>::member_type& team) const;
+
+  template<int NEIGHFLAG, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const;
+
+  template<int NEIGHFLAG, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii, EV_FLOAT&) const;
+
+ protected:
+  int inum, maxneigh, chunk_size, chunk_offset, idx_rho_max;
+  int host_flag;
+
+  int eflag, vflag;
+
+  int neighflag, max_ndensity;
+  int nelements, lmax, nradmax, nradbase;
+
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist;
+  typename AT::t_int_1d_randomread d_numneigh;
+
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d_randomread type;
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq, k_scale;
+  typedef Kokkos::View<F_FLOAT**, DeviceType> t_fparams;
+  t_fparams d_cutsq, d_scale;
+
+  typename AT::t_int_1d d_map;
+
+  int need_dup;
+
+  using KKDeviceType = typename KKDevice<DeviceType>::value;
+
+  template<typename DataType, typename Layout>
+  using DupScatterView = KKScatterView<DataType, Layout, KKDeviceType, KKScatterSum, KKScatterDuplicated>;
+
+  template<typename DataType, typename Layout>
+  using NonDupScatterView = KKScatterView<DataType, Layout, KKDeviceType, KKScatterSum, KKScatterNonDuplicated>;
+
+  DupScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout> dup_f;
+  DupScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout> dup_vatom;
+
+  NonDupScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout> ndup_f;
+  NonDupScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout> ndup_vatom;
+
+  friend void pair_virial_fdotr_compute<PairPACEExtrapolationKokkos>(PairPACEExtrapolationKokkos*);
+
+  void grow(int, int);
+  void copy_pertype();
+  void copy_splines();
+  void copy_tilde();
+  void allocate() override;
+  void precompute_harmonics();
+  double memory_usage() override;
+
+  template<int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz,
+      const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_barplm(int, int, double, int) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_ylm(int, int, double, double, double, int) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void cutoff_func_poly(const double, const double, const double, double &, double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void Fexp(const double, const double, double &, double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void FexpShiftedScaled(const double, const double, double &, double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void inner_cutoff(const double, const double, const double, double &, double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void FS_values_and_derivatives(const int, double&, const int) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void evaluate_splines(const int, const int, double, int, int, int, int) const;
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&, int);
+
+  template<class TagStyle>
+  void check_team_size_reduce(int, int&, int);
+
+  // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
+
+  typedef Kokkos::View<int*, DeviceType> t_ace_1i;
+  typedef Kokkos::View<int**, DeviceType> t_ace_2i;
+  typedef Kokkos::View<int***, DeviceType> t_ace_3i;
+  typedef Kokkos::View<int****, DeviceType> t_ace_4i;
+  typedef Kokkos::View<double*, DeviceType> t_ace_1d;
+  typedef Kokkos::View<double**, DeviceType> t_ace_2d;
+  typedef Kokkos::View<double*[3], DeviceType> t_ace_2d3;
+  typedef Kokkos::View<double***, DeviceType> t_ace_3d;
+  typedef Kokkos::View<double**[3], DeviceType> t_ace_3d3;
+  typedef Kokkos::View<double**[4], DeviceType> t_ace_3d4;
+  typedef Kokkos::View<double****, DeviceType> t_ace_4d;
+  typedef Kokkos::View<complex*, DeviceType> t_ace_1c;
+  typedef Kokkos::View<complex**, DeviceType> t_ace_2c;
+  typedef Kokkos::View<complex***, DeviceType> t_ace_3c;
+  typedef Kokkos::View<complex**[3], DeviceType> t_ace_3c3;
+  typedef Kokkos::View<complex****, DeviceType> t_ace_4c;
+  typedef Kokkos::View<complex***[3], DeviceType> t_ace_4c3;
+
+  t_ace_3d A_rank1;
+  t_ace_4c A;
+
+  t_ace_3c A_list;
+  t_ace_3c A_forward_prod;
+
+  t_ace_3d weights_rank1;
+  t_ace_4c weights;
+
+  t_ace_1d e_atom;
+  t_ace_2d rhos;
+  t_ace_2d dF_drho;
+
+  // hard-core repulsion
+  t_ace_1d rho_core;
+  t_ace_3c dB_flatten;
+  t_ace_2d cr;
+  t_ace_2d dcr;
+  t_ace_1d dF_drho_core;
+
+  // radial functions
+  t_ace_4d fr;
+  t_ace_4d dfr;
+  t_ace_3d gr;
+  t_ace_3d dgr;
+  t_ace_3d d_values;
+  t_ace_3d d_derivatives;
+
+  // Spherical Harmonics
+
+  void pre_compute_harmonics(int);
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_barplm(double rz, int lmaxi);
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_ylm(double rx, double ry, double rz, int lmaxi);
+
+  t_ace_1d alm;
+  t_ace_1d blm;
+  t_ace_1d cl;
+  t_ace_1d dl;
+
+  t_ace_3d plm;
+  t_ace_3d dplm;
+
+  t_ace_3c ylm;
+  t_ace_4c3 dylm;
+
+  // short neigh list
+  t_ace_1i d_ncount;
+  t_ace_2d d_mu;
+  t_ace_2d d_rnorms;
+  t_ace_3d3 d_rhats;
+  t_ace_2i d_nearest;
+
+  // per-type
+  t_ace_1i d_ndensity;
+  t_ace_1i d_npoti;
+  t_ace_1d d_rho_core_cutoff;
+  t_ace_1d d_drho_core_cutoff;
+  t_ace_1d d_E0vals;
+  t_ace_2d d_wpre;
+  t_ace_2d d_mexp;
+
+  // tilde
+  t_ace_1i d_idx_rho_count;
+  t_ace_2i d_rank;
+  t_ace_2i d_num_ms_combs;
+  t_ace_2i d_offsets;
+  t_ace_3i d_mus;
+  t_ace_3i d_ns;
+  t_ace_3i d_ls;
+  t_ace_3i d_ms_combs;
+  t_ace_3d d_ctildes;
+
+  t_ace_3d3 f_ij;
+
+ public:
+  struct SplineInterpolatorKokkos {
+    int ntot, nlut, num_of_functions;
+    double cutoff, deltaSplineBins, invrscalelookup, rscalelookup;
+
+    t_ace_3d4 lookupTable;
+
+    void operator=(const SplineInterpolator &spline) {
+      cutoff = spline.cutoff;
+      deltaSplineBins = spline.deltaSplineBins;
+      ntot = spline.ntot;
+      nlut = spline.nlut;
+      invrscalelookup = spline.invrscalelookup;
+      rscalelookup = spline.rscalelookup;
+      num_of_functions = spline.num_of_functions;
+
+      lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions);
+      auto h_lookupTable = Kokkos::create_mirror_view(lookupTable);
+      for (int i = 0; i < ntot+1; i++)
+        for (int j = 0; j < num_of_functions; j++)
+          for (int k = 0; k < 4; k++)
+            h_lookupTable(i, j, k) = spline.lookupTable(i, j, k);
+      Kokkos::deep_copy(lookupTable, h_lookupTable);
+    }
+
+    void deallocate() {
+      lookupTable = t_ace_3d4();
+    }
+
+    double memory_usage() {
+      return lookupTable.span() * sizeof(typename decltype(lookupTable)::value_type);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const;
+  };
+
+  Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType> k_splines_gk;
+  Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType> k_splines_rnl;
+  Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType> k_splines_hc;
+
+};
+}    // namespace LAMMPS_NS
+
+#endif
+#endif

From 1b92569187093deb1f620d33363e9000d00c4a79 Mon Sep 17 00:00:00 2001
From: Yury Lysogorskiy <yury.lysogorskiy@rub.de>
Date: Mon, 2 Jan 2023 18:29:12 +0100
Subject: [PATCH 02/12] WIP: pair_pace_extrapolation_kokkos.h/cpp: - rename
 idx_rho_max -> idx_ms_combs_max,  d_idx_rho_count ->d_idx_ms_combs_count,
 d_offsets->d_func_inds - remove d_ctildes, add d_gen_cgs and d_coeffs - use
 ACEBBasisFunction - update TagPairPACEComputeRho and
 TagPairPACEComputeWeights - pair_pace_extrapolation.h/cpp: add chunksize
 option

---
 src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 249 ++++++++++--------
 src/KOKKOS/pair_pace_extrapolation_kokkos.h   |  14 +-
 src/ML-PACE/pair_pace_extrapolation.cpp       |  22 +-
 src/ML-PACE/pair_pace_extrapolation.h         |   4 +-
 4 files changed, 164 insertions(+), 125 deletions(-)

diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
index b4f4932db4..006cf5e609 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
@@ -34,19 +34,30 @@
 #include "ace-evaluator/ace_recursive.h"
 #include "ace-evaluator/ace_version.h"
 #include "ace-evaluator/ace_radial.h"
+
+#include "ace/ace_b_basis.h"
+#include "ace/ace_b_evaluator.h"
+
 #include <cstring>
 
 namespace LAMMPS_NS {
-struct ACEImpl {
-  ACEImpl() : basis_set(nullptr), ace(nullptr) {}
-  ~ACEImpl()
-  {
-    delete basis_set;
-    delete ace;
-  }
-  ACECTildeBasisSet *basis_set;
-  ACERecursiveEvaluator *ace;
-};
+    struct ACEALImpl {
+        ACEALImpl() : basis_set(nullptr), ace(nullptr), ctilde_basis_set(nullptr), rec_ace(nullptr) {}
+
+        ~ACEALImpl()
+        {
+            delete basis_set;
+            delete ace;
+
+            delete ctilde_basis_set;
+            delete rec_ace;
+        }
+
+        ACEBBasisSet *basis_set;
+        ACEBEvaluator *ace;
+        ACECTildeBasisSet *ctilde_basis_set;
+        ACERecursiveEvaluator *rec_ace;
+    };
 }    // namespace LAMMPS_NS
 
 using namespace LAMMPS_NS;
@@ -57,7 +68,7 @@ enum{FS,FS_SHIFTEDSCALED};
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
-PairPACEExtrapolationKokkos<DeviceType>::PairPACEExtrapolationKokkos(LAMMPS *lmp) : PairPACE(lmp)
+PairPACEExtrapolationKokkos<DeviceType>::PairPACEExtrapolationKokkos(LAMMPS *lmp) : PairPACEExtrapolation(lmp)
 {
   respa_enable = 0;
 
@@ -107,9 +118,9 @@ void PairPACEExtrapolationKokkos<DeviceType>::grow(int natom, int maxneigh)
     MemKK::realloc_kokkos(A, "pace:A", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1));
     MemKK::realloc_kokkos(A_rank1, "pace:A_rank1", natom, nelements, nradbase);
 
-    MemKK::realloc_kokkos(A_list, "pace:A_list", natom, idx_rho_max, basis_set->rankmax);
+    MemKK::realloc_kokkos(A_list, "pace:A_list", natom, idx_ms_combs_max, basis_set->rankmax);
     //size is +1 of max to avoid out-of-boundary array access in double-triangular scheme
-    MemKK::realloc_kokkos(A_forward_prod, "pace:A_forward_prod", natom, idx_rho_max, basis_set->rankmax + 1);
+    MemKK::realloc_kokkos(A_forward_prod, "pace:A_forward_prod", natom, idx_ms_combs_max, basis_set->rankmax + 1);
 
     MemKK::realloc_kokkos(e_atom, "pace:e_atom", natom);
     MemKK::realloc_kokkos(rhos, "pace:rhos", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion
@@ -121,7 +132,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::grow(int natom, int maxneigh)
     // hard-core repulsion
     MemKK::realloc_kokkos(rho_core, "pace:rho_core", natom);
     MemKK::realloc_kokkos(dF_drho_core, "pace:dF_drho_core", natom);
-    MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_rho_max, basis_set->rankmax);
+    MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_ms_combs_max, basis_set->rankmax);
   }
 
   if (((int)ylm.extent(0) < natom) || ((int)ylm.extent(1) < maxneigh)) {
@@ -263,120 +274,134 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
 
   // flatten loops, get per-element count and max
 
-  idx_rho_max = 0;
+  idx_ms_combs_max = 0;
   int total_basis_size_max = 0;
 
-  MemKK::realloc_kokkos(d_idx_rho_count, "pace:idx_rho_count", nelements);
-  auto h_idx_rho_count = Kokkos::create_mirror_view(d_idx_rho_count);
+  MemKK::realloc_kokkos(d_idx_ms_combs_count, "pace:idx_ms_combs_count", nelements);
+  auto h_idx_ms_combs_count = Kokkos::create_mirror_view(d_idx_ms_combs_count);
 
-  for (int n = 0; n < nelements; n++) {
-    int idx_rho = 0;
-    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n];
-    const int total_basis_size = basis_set->total_basis_size[n];
+  for (int mu = 0; mu < nelements; mu++) {
+    int idx_ms_combs = 0;
+    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[mu];
+    const int total_basis_size = basis_set->total_basis_size[mu];
 
-    ACECTildeBasisFunction *basis = basis_set->basis[n];
+    ACEBBasisFunction *basis = basis_set->basis[mu];
 
     // rank=1
     for (int func_rank1_ind = 0; func_rank1_ind < total_basis_size_rank1; ++func_rank1_ind)
-      idx_rho++;
+      idx_ms_combs++;
 
     // rank > 1
     for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
-      ACECTildeBasisFunction *func = &basis[func_ind];
+      ACEBBasisFunction *func = &basis[func_ind];
 
       // loop over {ms} combinations in sum
       for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind)
-        idx_rho++;
+        idx_ms_combs++;
     }
-    h_idx_rho_count(n) = idx_rho;
-    idx_rho_max = MAX(idx_rho_max, idx_rho);
+    h_idx_ms_combs_count(mu) = idx_ms_combs;
+    idx_ms_combs_max = MAX(idx_ms_combs_max, idx_ms_combs);
     total_basis_size_max = MAX(total_basis_size_max, total_basis_size_rank1 + total_basis_size);
   }
 
-  Kokkos::deep_copy(d_idx_rho_count, h_idx_rho_count);
+  Kokkos::deep_copy(d_idx_ms_combs_count, h_idx_ms_combs_count);
 
   MemKK::realloc_kokkos(d_rank, "pace:rank", nelements, total_basis_size_max);
   MemKK::realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_basis_size_max);
-  MemKK::realloc_kokkos(d_offsets, "pace:offsets", nelements, idx_rho_max);
+  MemKK::realloc_kokkos(d_func_inds, "pace:func_inds", nelements, idx_ms_combs_max);
   MemKK::realloc_kokkos(d_mus, "pace:mus", nelements, total_basis_size_max, basis_set->rankmax);
   MemKK::realloc_kokkos(d_ns, "pace:ns", nelements, total_basis_size_max, basis_set->rankmax);
   MemKK::realloc_kokkos(d_ls, "pace:ls", nelements, total_basis_size_max, basis_set->rankmax);
-  MemKK::realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_rho_max, basis_set->rankmax);
-  MemKK::realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_rho_max, basis_set->ndensitymax);
+  MemKK::realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_ms_combs_max, basis_set->rankmax);
+  //MemKK::realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_ms_combs_max, basis_set->ndensitymax);
+  MemKK::realloc_kokkos(d_gen_cgs, "pace:gen_cgs", nelements, idx_ms_combs_max);
+  MemKK::realloc_kokkos(d_coeffs, "pace:coeffs", nelements, total_basis_size_max, basis_set->ndensitymax);
 
   auto h_rank = Kokkos::create_mirror_view(d_rank);
   auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs);
-  auto h_offsets = Kokkos::create_mirror_view(d_offsets);
+  auto h_func_inds = Kokkos::create_mirror_view(d_func_inds);
   auto h_mus = Kokkos::create_mirror_view(d_mus);
   auto h_ns = Kokkos::create_mirror_view(d_ns);
   auto h_ls = Kokkos::create_mirror_view(d_ls);
   auto h_ms_combs = Kokkos::create_mirror_view(d_ms_combs);
-  auto h_ctildes = Kokkos::create_mirror_view(d_ctildes);
+//  auto h_ctildes = Kokkos::create_mirror_view(d_ctildes);
+  auto h_gen_cgs = Kokkos::create_mirror_view(d_gen_cgs);
+  auto h_coeffs = Kokkos::create_mirror_view(d_coeffs);
 
   // copy values on host
 
-  for (int n = 0; n < nelements; n++) {
-    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[n];
-    const int total_basis_size = basis_set->total_basis_size[n];
+  for (int mu = 0; mu < nelements; mu++) {
+    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[mu];
+    const int total_basis_size = basis_set->total_basis_size[mu];
 
-    ACECTildeBasisFunction *basis_rank1 = basis_set->basis_rank1[n];
-    ACECTildeBasisFunction *basis = basis_set->basis[n];
+    ACEBBasisFunction *basis_rank1 = basis_set->basis_rank1[mu];
+    ACEBBasisFunction *basis = basis_set->basis[mu];
 
-    const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity;
+    const int ndensity = basis_set->map_embedding_specifications.at(mu).ndensity;
 
-    int idx_rho = 0;
+    int idx_ms_comb = 0;
 
     // rank=1
-    for (int offset = 0; offset < total_basis_size_rank1; ++offset) {
-      ACECTildeBasisFunction *func = &basis_rank1[offset];
-      h_rank(n, offset) = 1;
-      h_mus(n, offset, 0) = func->mus[0];
-      h_ns(n, offset, 0) = func->ns[0];
-      for (int p = 0; p < ndensity; p++)
-        h_ctildes(n, idx_rho, p) = func->ctildes[p];
-      h_offsets(n, idx_rho) = offset;
-      idx_rho++;
+    for (int func_ind = 0; func_ind < total_basis_size_rank1; ++func_ind) {
+      ACEBBasisFunction *func = &basis_rank1[func_ind];
+      h_rank(mu, func_ind) = 1;
+      h_mus(mu, func_ind, 0) = func->mus[0];
+      h_ns(mu, func_ind, 0) = func->ns[0];
+
+      for (int p = 0; p < ndensity; ++p)
+            h_coeffs(mu, func_ind, p) = func->coeff[p];
+
+      h_gen_cgs(mu, idx_ms_comb) = func->gen_cgs[0];
+
+      h_func_inds(mu, idx_ms_comb) = func_ind;
+      idx_ms_comb++;
     }
 
     // rank > 1
     for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
-      ACECTildeBasisFunction *func = &basis[func_ind];
+      ACEBBasisFunction *func = &basis[func_ind];
       // TODO: check if func->ctildes are zero, then skip
 
-      const int offset = total_basis_size_rank1 + func_ind;
+      const int func_ind_through = total_basis_size_rank1 + func_ind;
 
-      const int rank = h_rank(n, offset) = func->rank;
-      h_num_ms_combs(n, offset) = func->num_ms_combs;
+      const int rank = h_rank(mu, func_ind_through) = func->rank;
+      h_num_ms_combs(mu, func_ind_through) = func->num_ms_combs;
       for (int t = 0; t < rank; t++) {
-        h_mus(n, offset, t) = func->mus[t];
-        h_ns(n, offset, t) = func->ns[t];
-        h_ls(n, offset, t) = func->ls[t];
+        h_mus(mu, func_ind_through, t) = func->mus[t];
+        h_ns(mu, func_ind_through, t) = func->ns[t];
+        h_ls(mu, func_ind_through, t) = func->ls[t];
       }
 
+      for (int p = 0; p < ndensity; ++p)
+            h_coeffs(mu, func_ind_through, p) = func->coeff[p];
+
+
       // loop over {ms} combinations in sum
       for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) {
         auto ms = &func->ms_combs[ms_ind * rank]; // current ms-combination (of length = rank)
         for (int t = 0; t < rank; t++)
-          h_ms_combs(n, idx_rho, t) = ms[t];
+          h_ms_combs(mu, idx_ms_comb, t) = ms[t];
 
-        for (int p = 0; p < ndensity; ++p) {
-          // real-part only multiplication
-          h_ctildes(n, idx_rho, p) = func->ctildes[ms_ind * ndensity + p];
-        }
-        h_offsets(n, idx_rho) = offset;
-        idx_rho++;
+
+        h_gen_cgs(mu, idx_ms_comb) = func->gen_cgs[ms_ind];
+
+
+        h_func_inds(mu, idx_ms_comb) = func_ind_through;
+        idx_ms_comb++;
       }
     }
   }
 
   Kokkos::deep_copy(d_rank, h_rank);
   Kokkos::deep_copy(d_num_ms_combs, h_num_ms_combs);
-  Kokkos::deep_copy(d_offsets, h_offsets);
+  Kokkos::deep_copy(d_func_inds, h_func_inds);
   Kokkos::deep_copy(d_mus, h_mus);
   Kokkos::deep_copy(d_ns, h_ns);
   Kokkos::deep_copy(d_ls, h_ls);
   Kokkos::deep_copy(d_ms_combs, h_ms_combs);
-  Kokkos::deep_copy(d_ctildes, h_ctildes);
+//  Kokkos::deep_copy(d_ctildes, h_ctildes);
+  Kokkos::deep_copy(d_gen_cgs, h_gen_cgs);
+  Kokkos::deep_copy(d_coeffs, h_coeffs);
 }
 
 /* ----------------------------------------------------------------------
@@ -391,7 +416,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::init_style()
       error->all(FLERR,"Pair style pace/kk can currently only run on a single "
                          "CPU thread");
 
-    PairPACE::init_style();
+    PairPACEExtrapolation::init_style();
     return;
   }
 
@@ -436,7 +461,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::init_style()
 template<class DeviceType>
 double PairPACEExtrapolationKokkos<DeviceType>::init_one(int i, int j)
 {
-  double cutone = PairPACE::init_one(i,j);
+  double cutone = PairPACEExtrapolation::init_one(i,j);
 
   k_scale.h_view(i,j) = k_scale.h_view(j,i) = scale[i][j];
   k_scale.template modify<LMPHostType>();
@@ -454,7 +479,7 @@ double PairPACEExtrapolationKokkos<DeviceType>::init_one(int i, int j)
 template<class DeviceType>
 void PairPACEExtrapolationKokkos<DeviceType>::coeff(int narg, char **arg)
 {
-  PairPACE::coeff(narg,arg);
+  PairPACEExtrapolation::coeff(narg,arg);
 
   // Set up element lists
 
@@ -471,7 +496,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::coeff(int narg, char **arg)
 template<class DeviceType>
 void PairPACEExtrapolationKokkos<DeviceType>::allocate()
 {
-  PairPACE::allocate();
+  PairPACEExtrapolation::allocate();
 
   int n = atom->ntypes + 1;
   MemKK::realloc_kokkos(d_map, "pace:map", n);
@@ -508,11 +533,10 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
 {
   if (host_flag) {
     atomKK->sync(Host,X_MASK|TYPE_MASK);
-    PairPACE::compute(eflag_in,vflag_in);
+    PairPACEExtrapolation::compute(eflag_in,vflag_in);
     atomKK->modified(Host,F_MASK);
     return;
   }
-
   eflag = eflag_in;
   vflag = vflag_in;
 
@@ -521,7 +545,6 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
   ev_init(eflag,vflag,0);
 
   // reallocate per-atom arrays if necessary
-
   if (eflag_atom) {
     memoryKK->destroy_kokkos(k_eatom,eatom);
     memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
@@ -532,14 +555,10 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
     memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
     d_vatom = k_vatom.view<DeviceType>();
   }
-
   copymode = 1;
   if (!force->newton_pair)
     error->all(FLERR,"PairPACEExtrapolationKokkos requires 'newton on'");
 
-  if (recursive)
-    error->all(FLERR,"Must use 'product' algorithm with pair pace/kk on the GPU");
-
   atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
   x = atomKK->k_x.view<DeviceType>();
   f = atomKK->k_f.view<DeviceType>();
@@ -636,7 +655,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
 
     //ComputeRho
     {
-      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeRho> policy_rho(0,chunk_size*idx_rho_max);
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeRho> policy_rho(0, chunk_size * idx_ms_combs_max);
       Kokkos::parallel_for("ComputeRho",policy_rho,*this);
     }
 
@@ -648,7 +667,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
 
     //ComputeWeights
     {
-      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeWeights> policy_weights(0,chunk_size*idx_rho_max);
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeWeights> policy_weights(0, chunk_size * idx_ms_combs_max);
       Kokkos::parallel_for("ComputeWeights",policy_weights,*this);
     }
 
@@ -910,63 +929,63 @@ template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeRho, const int& iter) const
 {
-  const int idx_rho = iter / chunk_size;
+  const int idx_ms_comb = iter / chunk_size;
   const int ii = iter % chunk_size;
 
   const int i = d_ilist[ii + chunk_offset];
   const int mu_i = d_map(type(i));
 
-  if (idx_rho >= d_idx_rho_count(mu_i)) return;
+  if (idx_ms_comb >= d_idx_ms_combs_count(mu_i)) return;
 
   const int ndensity = d_ndensity(mu_i);
 
-  const int offset = d_offsets(mu_i, idx_rho);
-  const int rank = d_rank(mu_i, offset);
+  const int func_ind = d_func_inds(mu_i, idx_ms_comb);
+  const int rank = d_rank(mu_i, func_ind);
   const int r = rank - 1;
 
   // Basis functions B with iterative product and density rho(p) calculation
   if (rank == 1) {
-    const int mu = d_mus(mu_i, offset, 0);
-    const int n = d_ns(mu_i, offset, 0);
+    const int mu = d_mus(mu_i, func_ind, 0);
+    const int n = d_ns(mu_i, func_ind, 0);
     double A_cur = A_rank1(ii, mu, n - 1);
     for (int p = 0; p < ndensity; ++p) {
       //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
-      Kokkos::atomic_add(&rhos(ii, p), d_ctildes(mu_i, idx_rho, p) * A_cur);
+      Kokkos::atomic_add(&rhos(ii, p), d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb) * A_cur);
     }
   } else { // rank > 1
     // loop over {ms} combinations in sum
 
     // loop over m, collect B  = product of A with given ms
-    A_forward_prod(ii, idx_rho, 0) = complex::one();
+    A_forward_prod(ii, idx_ms_comb, 0) = complex::one();
 
     // fill forward A-product triangle
     for (int t = 0; t < rank; t++) {
       //TODO: optimize ns[t]-1 -> ns[t] during functions construction
-      const int mu = d_mus(mu_i, offset, t);
-      const int n = d_ns(mu_i, offset, t);
-      const int l = d_ls(mu_i, offset, t);
-      const int m = d_ms_combs(mu_i, idx_rho, t); // current ms-combination (of length = rank)
+      const int mu = d_mus(mu_i, func_ind, t);
+      const int n = d_ns(mu_i, func_ind, t);
+      const int l = d_ls(mu_i, func_ind, t);
+      const int m = d_ms_combs(mu_i, idx_ms_comb, t); // current ms-combination (of length = rank)
       const int idx = l * (l + 1) + m; // (l, m)
-      A_list(ii, idx_rho, t) = A(ii, mu, n - 1, idx);
-      A_forward_prod(ii, idx_rho, t + 1) = A_forward_prod(ii, idx_rho, t) * A_list(ii, idx_rho, t);
+      A_list(ii, idx_ms_comb, t) = A(ii, mu, n - 1, idx);
+      A_forward_prod(ii, idx_ms_comb, t + 1) = A_forward_prod(ii, idx_ms_comb, t) * A_list(ii, idx_ms_comb, t);
     }
 
     complex A_backward_prod = complex::one();
 
     // fill backward A-product triangle
     for (int t = r; t >= 1; t--) {
-      const complex dB = A_forward_prod(ii, idx_rho, t) * A_backward_prod; // dB - product of all A's except t-th
-      dB_flatten(ii, idx_rho, t) = dB;
+      const complex dB = A_forward_prod(ii, idx_ms_comb, t) * A_backward_prod; // dB - product of all A's except t-th
+      dB_flatten(ii, idx_ms_comb, t) = dB;
 
-      A_backward_prod = A_backward_prod * A_list(ii, idx_rho, t);
+      A_backward_prod = A_backward_prod * A_list(ii, idx_ms_comb, t);
     }
-    dB_flatten(ii, idx_rho, 0) = A_forward_prod(ii, idx_rho, 0) * A_backward_prod;
+    dB_flatten(ii, idx_ms_comb, 0) = A_forward_prod(ii, idx_ms_comb, 0) * A_backward_prod;
 
-    const complex B = A_forward_prod(ii, idx_rho, rank);
+    const complex B = A_forward_prod(ii, idx_ms_comb, rank);
 
     for (int p = 0; p < ndensity; ++p) {
       // real-part only multiplication
-      Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_ctildes(mu_i, idx_rho, p)));
+      Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb)));
     }
   }
 }
@@ -1011,43 +1030,43 @@ template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeWeights, const int& iter) const
 {
-  const int idx_rho = iter / chunk_size;
+  const int idx_ms_comb = iter / chunk_size;
   const int ii = iter % chunk_size;
 
   const int i = d_ilist[ii + chunk_offset];
   const int mu_i = d_map(type(i));
 
-  if (idx_rho >= d_idx_rho_count(mu_i)) return;
+  if (idx_ms_comb >= d_idx_ms_combs_count(mu_i)) return;
 
   const int ndensity = d_ndensity(mu_i);
 
-  const int offset = d_offsets(mu_i, idx_rho);
-  const int rank = d_rank(mu_i, offset);
+  const int func_ind = d_func_inds(mu_i, idx_ms_comb);
+  const int rank = d_rank(mu_i, func_ind);
 
   // Weights and theta calculation
 
   if (rank == 1) {
-    const int mu = d_mus(mu_i, offset, 0);
-    const int n = d_ns(mu_i, offset, 0);
+    const int mu = d_mus(mu_i, func_ind, 0);
+    const int n = d_ns(mu_i, func_ind, 0);
     double theta = 0.0;
     for (int p = 0; p < ndensity; ++p) {
       // for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
-      theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p);
+      theta += dF_drho(ii, p) * d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb);
     }
     Kokkos::atomic_add(&weights_rank1(ii, mu, n - 1), theta);
   } else { // rank > 1
     double theta = 0.0;
     for (int p = 0; p < ndensity; ++p)
-      theta += dF_drho(ii, p) * d_ctildes(mu_i, idx_rho, p);
+      theta += dF_drho(ii, p) * d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb);
 
     theta *= 0.5; // 0.5 factor due to possible double counting ???
     for (int t = 0; t < rank; ++t) {
-      const int m_t = d_ms_combs(mu_i, idx_rho, t);
+      const int m_t = d_ms_combs(mu_i, idx_ms_comb, t);
       const int factor = (m_t % 2 == 0 ? 1 : -1);
-      const complex dB = dB_flatten(ii, idx_rho, t);
-      const int mu_t = d_mus(mu_i, offset, t);
-      const int n_t = d_ns(mu_i, offset, t);
-      const int l_t = d_ls(mu_i, offset, t);
+      const complex dB = dB_flatten(ii, idx_ms_comb, t);
+      const int mu_t = d_mus(mu_i, func_ind, t);
+      const int n_t = d_ns(mu_i, func_ind, t);
+      const int l_t = d_ls(mu_i, func_ind, t);
       const int idx = l_t * (l_t + 1) + m_t; // (l, m)
       const complex value = theta * dB;
       Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).re), value.re);
@@ -1687,15 +1706,17 @@ double PairPACEExtrapolationKokkos<DeviceType>::memory_usage()
   bytes += MemKK::memory_usage(d_npoti);
   bytes += MemKK::memory_usage(d_wpre);
   bytes += MemKK::memory_usage(d_mexp);
-  bytes += MemKK::memory_usage(d_idx_rho_count);
+  bytes += MemKK::memory_usage(d_idx_ms_combs_count);
   bytes += MemKK::memory_usage(d_rank);
   bytes += MemKK::memory_usage(d_num_ms_combs);
-  bytes += MemKK::memory_usage(d_offsets);
+  bytes += MemKK::memory_usage(d_func_inds);
   bytes += MemKK::memory_usage(d_mus);
   bytes += MemKK::memory_usage(d_ns);
   bytes += MemKK::memory_usage(d_ls);
   bytes += MemKK::memory_usage(d_ms_combs);
-  bytes += MemKK::memory_usage(d_ctildes);
+//  bytes += MemKK::memory_usage(d_ctildes);
+  bytes += MemKK::memory_usage(d_gen_cgs);
+  bytes += MemKK::memory_usage(d_coeffs);
   bytes += MemKK::memory_usage(alm);
   bytes += MemKK::memory_usage(blm);
   bytes += MemKK::memory_usage(cl);
diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
index 482d068725..568d12b90c 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.h
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
@@ -23,7 +23,7 @@ PairStyle(pace/extrapolation/kk/host,PairPACEExtrapolationKokkos<LMPHostType>);
 #ifndef LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H
 #define LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H
 
-#include "pair_pace.h"
+#include "pair_pace_extrapolation.h"
 #include "ace-evaluator/ace_radial.h"
 #include "kokkos_type.h"
 #include "pair_kokkos.h"
@@ -31,7 +31,7 @@ PairStyle(pace/extrapolation/kk/host,PairPACEExtrapolationKokkos<LMPHostType>);
 namespace LAMMPS_NS {
 
 template<class DeviceType>
-class PairPACEExtrapolationKokkos : public PairPACE {
+class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
  public:
   struct TagPairPACEComputeNeigh{};
   struct TagPairPACEComputeRadial{};
@@ -95,7 +95,7 @@ class PairPACEExtrapolationKokkos : public PairPACE {
   void operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii, EV_FLOAT&) const;
 
  protected:
-  int inum, maxneigh, chunk_size, chunk_offset, idx_rho_max;
+  int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max;
   int host_flag;
 
   int eflag, vflag;
@@ -274,15 +274,17 @@ class PairPACEExtrapolationKokkos : public PairPACE {
   t_ace_2d d_mexp;
 
   // tilde
-  t_ace_1i d_idx_rho_count;
+  t_ace_1i d_idx_ms_combs_count;
   t_ace_2i d_rank;
   t_ace_2i d_num_ms_combs;
-  t_ace_2i d_offsets;
+  t_ace_2i d_func_inds;
   t_ace_3i d_mus;
   t_ace_3i d_ns;
   t_ace_3i d_ls;
   t_ace_3i d_ms_combs;
-  t_ace_3d d_ctildes;
+//  t_ace_3d d_ctildes;
+  t_ace_2d d_gen_cgs;
+  t_ace_3d d_coeffs;
 
   t_ace_3d3 f_ij;
 
diff --git a/src/ML-PACE/pair_pace_extrapolation.cpp b/src/ML-PACE/pair_pace_extrapolation.cpp
index ec185e75df..2c5e5fafe9 100644
--- a/src/ML-PACE/pair_pace_extrapolation.cpp
+++ b/src/ML-PACE/pair_pace_extrapolation.cpp
@@ -98,6 +98,8 @@ PairPACEExtrapolation::PairPACEExtrapolation(LAMMPS *lmp) : Pair(lmp)
   scale = nullptr;
   flag_compute_extrapolation_grade = 0;
   extrapolation_grade_gamma = nullptr;
+
+  chunksize = 4096;
 }
 
 /* ----------------------------------------------------------------------
@@ -133,7 +135,7 @@ void PairPACEExtrapolation::compute(int eflag, int vflag)
 
   double **x = atom->x;
   double **f = atom->f;
-  tagint *tag = atom->tag;
+//  tagint *tag = atom->tag;
   int *type = atom->type;
   // number of atoms in cell
   int nlocal = atom->nlocal;
@@ -141,7 +143,7 @@ void PairPACEExtrapolation::compute(int eflag, int vflag)
   int newton_pair = force->newton_pair;
 
   // number of atoms including ghost atoms
-  int nall = nlocal + atom->nghost;
+//  int nall = nlocal + atom->nghost;
 
   // inum: length of the neighborlists list
   inum = list->inum;
@@ -283,7 +285,20 @@ void PairPACEExtrapolation::allocate()
 
 void PairPACEExtrapolation::settings(int narg, char **arg)
 {
-  if (narg > 0) error->all(FLERR, "Pair style pace/extrapolation supports no keywords");
+//  if (narg > 2) error->all(FLERR, "Pair style pace/extrapolation supports no keywords");
+  if (narg > 2) utils::missing_cmd_args(FLERR, "pair_style pace/extrapolation", error);
+  // ACE potentials are parameterized in metal units
+  if (strcmp("metal", update->unit_style) != 0)
+    error->all(FLERR, "ACE potentials require 'metal' units");
+
+  int iarg = 0;
+  while (iarg < narg) {
+      if (strcmp(arg[iarg], "chunksize") == 0) {
+          chunksize = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+          iarg += 2;
+      } else
+          error->all(FLERR, "Unknown pair_style pace keyword: {}", arg[iarg]);
+  }
 
   if (comm->me == 0)
     utils::logmesg(lmp, "ACE/AL version: {}.{}.{}\n", VERSION_YEAR, VERSION_MONTH, VERSION_DAY);
@@ -343,7 +358,6 @@ void PairPACEExtrapolation::coeff(int narg, char **arg)
   aceimpl->rec_ace->element_type_mapping.init(atom->ntypes + 1);
   aceimpl->rec_ace->element_type_mapping.fill(-1);    //-1 means atom not included into potential
 
-  FILE *species_type_file = nullptr;
 
   const int n = atom->ntypes;
   element_names.resize(n);
diff --git a/src/ML-PACE/pair_pace_extrapolation.h b/src/ML-PACE/pair_pace_extrapolation.h
index c5d9da23db..6f7eeb279e 100644
--- a/src/ML-PACE/pair_pace_extrapolation.h
+++ b/src/ML-PACE/pair_pace_extrapolation.h
@@ -49,13 +49,15 @@ class PairPACEExtrapolation : public Pair {
   struct ACEALImpl *aceimpl;
   int nmax;
 
-  void allocate();
+  virtual void allocate();
   std::vector<std::string> element_names;    // list of elements (used by dump pace/extrapolation)
   double *extrapolation_grade_gamma;         //per-atom gamma value
 
   int flag_compute_extrapolation_grade;
 
   double **scale;
+
+  int chunksize;
 };
 
 }    // namespace LAMMPS_NS

From 014b892e3b13fd3e160d17fe5f7135052a810b49 Mon Sep 17 00:00:00 2001
From: Yury Lysogorskiy <yury.lysogorskiy@rub.de>
Date: Mon, 2 Jan 2023 23:34:24 +0100
Subject: [PATCH 03/12] WIP: - add TagPairPACEComputeGamma kernel - add
 d_total_basis_size - add gamma_flag, d_ASI, projections and gamma

---
 src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 109 ++++++++++++++----
 src/KOKKOS/pair_pace_extrapolation_kokkos.h   |  12 +-
 2 files changed, 100 insertions(+), 21 deletions(-)

diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
index 006cf5e609..c5ca97d036 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
@@ -13,7 +13,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Stan Moore (SNL)
+   Contributing author: Yury Lysogorskiy (ICAMS)
 ------------------------------------------------------------------------- */
 
 #include "pair_pace_extrapolation_kokkos.h"
@@ -29,9 +29,9 @@
 #include "neighbor_kokkos.h"
 #include "neigh_request.h"
 
-#include "ace-evaluator/ace_c_basis.h"
-#include "ace-evaluator/ace_evaluator.h"
-#include "ace-evaluator/ace_recursive.h"
+//#include "ace-evaluator/ace_c_basis.h"
+//#include "ace-evaluator/ace_evaluator.h"
+//#include "ace-evaluator/ace_recursive.h"
 #include "ace-evaluator/ace_version.h"
 #include "ace-evaluator/ace_radial.h"
 
@@ -42,21 +42,16 @@
 
 namespace LAMMPS_NS {
     struct ACEALImpl {
-        ACEALImpl() : basis_set(nullptr), ace(nullptr), ctilde_basis_set(nullptr), rec_ace(nullptr) {}
+        ACEALImpl() : basis_set(nullptr), ace(nullptr) {}
 
         ~ACEALImpl()
         {
             delete basis_set;
             delete ace;
-
-            delete ctilde_basis_set;
-            delete rec_ace;
         }
 
         ACEBBasisSet *basis_set;
         ACEBEvaluator *ace;
-        ACECTildeBasisSet *ctilde_basis_set;
-        ACERecursiveEvaluator *rec_ace;
     };
 }    // namespace LAMMPS_NS
 
@@ -132,7 +127,12 @@ void PairPACEExtrapolationKokkos<DeviceType>::grow(int natom, int maxneigh)
     // hard-core repulsion
     MemKK::realloc_kokkos(rho_core, "pace:rho_core", natom);
     MemKK::realloc_kokkos(dF_drho_core, "pace:dF_drho_core", natom);
+
     MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_ms_combs_max, basis_set->rankmax);
+
+    //B-projections
+    MemKK::realloc_kokkos(projections, "pace:projections", natom, total_num_functions_max); // per-atom B-projections
+    MemKK::realloc_kokkos(gamma, "pace:gamma", natom); // per-atom gamma
   }
 
   if (((int)ylm.extent(0) < natom) || ((int)ylm.extent(1) < maxneigh)) {
@@ -271,14 +271,17 @@ template<class DeviceType>
 void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
 {
   auto basis_set = aceimpl->basis_set;
+  auto b_evaluator = aceimpl->ace;
 
   // flatten loops, get per-element count and max
 
   idx_ms_combs_max = 0;
-  int total_basis_size_max = 0;
+  total_num_functions_max = 0;
 
   MemKK::realloc_kokkos(d_idx_ms_combs_count, "pace:idx_ms_combs_count", nelements);
+  MemKK::realloc_kokkos(d_total_basis_size, "pace:total_basis_size", nelements);
   auto h_idx_ms_combs_count = Kokkos::create_mirror_view(d_idx_ms_combs_count);
+  auto h_total_basis_size = Kokkos::create_mirror_view(d_total_basis_size);
 
   for (int mu = 0; mu < nelements; mu++) {
     int idx_ms_combs = 0;
@@ -301,21 +304,25 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
     }
     h_idx_ms_combs_count(mu) = idx_ms_combs;
     idx_ms_combs_max = MAX(idx_ms_combs_max, idx_ms_combs);
-    total_basis_size_max = MAX(total_basis_size_max, total_basis_size_rank1 + total_basis_size);
+    total_num_functions_max = MAX(total_num_functions_max, total_basis_size_rank1 + total_basis_size);
+    h_total_basis_size(mu) = total_basis_size_rank1 + total_basis_size;
   }
 
   Kokkos::deep_copy(d_idx_ms_combs_count, h_idx_ms_combs_count);
+  Kokkos::deep_copy(d_total_basis_size, h_total_basis_size);
 
-  MemKK::realloc_kokkos(d_rank, "pace:rank", nelements, total_basis_size_max);
-  MemKK::realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_basis_size_max);
+  MemKK::realloc_kokkos(d_rank, "pace:rank", nelements, total_num_functions_max);
+  MemKK::realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_num_functions_max);
   MemKK::realloc_kokkos(d_func_inds, "pace:func_inds", nelements, idx_ms_combs_max);
-  MemKK::realloc_kokkos(d_mus, "pace:mus", nelements, total_basis_size_max, basis_set->rankmax);
-  MemKK::realloc_kokkos(d_ns, "pace:ns", nelements, total_basis_size_max, basis_set->rankmax);
-  MemKK::realloc_kokkos(d_ls, "pace:ls", nelements, total_basis_size_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_mus, "pace:mus", nelements, total_num_functions_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_ns, "pace:ns", nelements, total_num_functions_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_ls, "pace:ls", nelements, total_num_functions_max, basis_set->rankmax);
   MemKK::realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_ms_combs_max, basis_set->rankmax);
-  //MemKK::realloc_kokkos(d_ctildes, "pace:ctildes", nelements, idx_ms_combs_max, basis_set->ndensitymax);
   MemKK::realloc_kokkos(d_gen_cgs, "pace:gen_cgs", nelements, idx_ms_combs_max);
-  MemKK::realloc_kokkos(d_coeffs, "pace:coeffs", nelements, total_basis_size_max, basis_set->ndensitymax);
+  MemKK::realloc_kokkos(d_coeffs, "pace:coeffs", nelements, total_num_functions_max, basis_set->ndensitymax);
+  // active set inverted
+  MemKK::realloc_kokkos(d_ASI, "pace:ASI", nelements, total_num_functions_max, total_num_functions_max);
+
 
   auto h_rank = Kokkos::create_mirror_view(d_rank);
   auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs);
@@ -327,6 +334,8 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
 //  auto h_ctildes = Kokkos::create_mirror_view(d_ctildes);
   auto h_gen_cgs = Kokkos::create_mirror_view(d_gen_cgs);
   auto h_coeffs = Kokkos::create_mirror_view(d_coeffs);
+// asi
+  auto h_ASI = Kokkos::create_mirror_view(d_ASI);
 
   // copy values on host
 
@@ -390,6 +399,14 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
         idx_ms_comb++;
       }
     }
+
+    // ASI
+    const auto &A_as_inv = b_evaluator->A_active_set_inv.at(mu);
+    for(int i = 0; i < total_basis_size_rank1 + total_basis_size; i++)
+        for(int j = 0; j < total_basis_size_rank1 + total_basis_size; j++){
+            h_ASI(mu,i,j)=A_as_inv(i,j);
+    }
+
   }
 
   Kokkos::deep_copy(d_rank, h_rank);
@@ -402,6 +419,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
 //  Kokkos::deep_copy(d_ctildes, h_ctildes);
   Kokkos::deep_copy(d_gen_cgs, h_gen_cgs);
   Kokkos::deep_copy(d_coeffs, h_coeffs);
+  Kokkos::deep_copy(d_ASI, h_ASI);
 }
 
 /* ----------------------------------------------------------------------
@@ -591,6 +609,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
 
   chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user
   chunk_offset = 0;
+  gamma_flag = 1;
 
   grow(chunk_size, maxneigh);
 
@@ -604,6 +623,9 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
     Kokkos::deep_copy(A_rank1, 0.0);
     Kokkos::deep_copy(rhos, 0.0);
 
+    Kokkos::deep_copy(projections, 0.0);
+    Kokkos::deep_copy(gamma, 0.0);
+
     EV_FLOAT ev_tmp;
 
     if (chunk_size > inum - chunk_offset)
@@ -665,6 +687,12 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
       Kokkos::parallel_for("ComputeFS",policy_fs,*this);
     }
 
+    //ComputeGamma
+    if (gamma_flag) {
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeGamma> policy_gamma(0,chunk_size);
+      Kokkos::parallel_for("ComputeGamma",policy_gamma,*this);
+    }
+
     //ComputeWeights
     {
       typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeWeights> policy_weights(0, chunk_size * idx_ms_combs_max);
@@ -952,6 +980,12 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeRho,
       //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
       Kokkos::atomic_add(&rhos(ii, p), d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb) * A_cur);
     }
+
+
+    //gamma_i
+    if(gamma_flag)
+        Kokkos::atomic_add(&projections(ii, func_ind),  d_gen_cgs(mu_i, idx_ms_comb) * A_cur);
+
   } else { // rank > 1
     // loop over {ms} combinations in sum
 
@@ -987,6 +1021,9 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeRho,
       // real-part only multiplication
       Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb)));
     }
+    //gamma_i
+    if(gamma_flag)
+        Kokkos::atomic_add(&projections(ii, func_ind),  B.real_part_product(d_gen_cgs(mu_i, idx_ms_comb)));
   }
 }
 
@@ -1024,6 +1061,35 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeFS,
   }
 }
 
+
+/* ---------------------------------------------------------------------- */
+
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeGamma, const int& ii) const
+{
+    const int i = d_ilist[ii + chunk_offset];
+    const int mu_i = d_map(type(i));
+    const int basis_size = d_total_basis_size(mu_i);
+
+
+    double gamma_max = 0;
+    for (int j = 0; j <basis_size; j++) {
+        double current_gamma = 0;
+
+        // compute row-matrix-multiplication asi_vector * A_as_inv (transposed matrix)
+        for (int k = 0; k < basis_size; k++)
+            current_gamma += projections(ii,k) * d_ASI(mu_i, j, k); //correct d_ASI(mu_i, j, k)
+
+        if (abs(current_gamma) > gamma_max)
+            gamma_max = abs(current_gamma);
+    }
+
+    // tally energy contribution
+    gamma(ii) = gamma_max;
+
+}
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
@@ -1714,13 +1780,16 @@ double PairPACEExtrapolationKokkos<DeviceType>::memory_usage()
   bytes += MemKK::memory_usage(d_ns);
   bytes += MemKK::memory_usage(d_ls);
   bytes += MemKK::memory_usage(d_ms_combs);
-//  bytes += MemKK::memory_usage(d_ctildes);
   bytes += MemKK::memory_usage(d_gen_cgs);
   bytes += MemKK::memory_usage(d_coeffs);
   bytes += MemKK::memory_usage(alm);
   bytes += MemKK::memory_usage(blm);
   bytes += MemKK::memory_usage(cl);
   bytes += MemKK::memory_usage(dl);
+  bytes += MemKK::memory_usage(d_total_basis_size);
+  bytes += MemKK::memory_usage(d_ASI);
+  bytes += MemKK::memory_usage(projections);
+  bytes += MemKK::memory_usage(gamma);
 
   if (k_splines_gk.h_view.data()) {
     for (int i = 0; i < nelements; i++) {
diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
index 568d12b90c..a5b4f8abc7 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.h
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
@@ -40,6 +40,7 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   struct TagPairPACEConjugateAi{};
   struct TagPairPACEComputeRho{};
   struct TagPairPACEComputeFS{};
+  struct TagPairPACEComputeGamma{};
   struct TagPairPACEComputeWeights{};
   struct TagPairPACEComputeDerivative{};
 
@@ -80,6 +81,9 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairPACEComputeFS,const int& ii) const;
 
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeGamma, const int& ii) const;
+
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairPACEComputeWeights,const int& iter) const;
 
@@ -95,8 +99,9 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   void operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii, EV_FLOAT&) const;
 
  protected:
-  int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max;
+  int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max, total_num_functions_max;
   int host_flag;
+  int gamma_flag;
 
   int eflag, vflag;
 
@@ -236,6 +241,10 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   t_ace_3d d_values;
   t_ace_3d d_derivatives;
 
+  // inverted active set
+  t_ace_3d d_ASI;
+  t_ace_2d projections;
+  t_ace_1d gamma;
   // Spherical Harmonics
 
   void pre_compute_harmonics(int);
@@ -275,6 +284,7 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
 
   // tilde
   t_ace_1i d_idx_ms_combs_count;
+  t_ace_1i d_total_basis_size;
   t_ace_2i d_rank;
   t_ace_2i d_num_ms_combs;
   t_ace_2i d_func_inds;

From 1f36bc49ab234151b95525d3e7c502af845782a5 Mon Sep 17 00:00:00 2001
From: Yury Lysogorskiy <yury.lysogorskiy@rub.de>
Date: Tue, 3 Jan 2023 14:08:31 +0100
Subject: [PATCH 04/12] - add extract and extract_peratom methods - rename
 device gamma array to d_gamma - make host h_gamma array - copy from h_gamma
 to host extrapolation_grade_gamma array for each chunk - transpose two last
 dimensions of d_ASI (small improvement of performance) - manage grows of
 extrapolation_grade_gamma

---
 src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 91 +++++++++++++++----
 src/KOKKOS/pair_pace_extrapolation_kokkos.h   | 14 ++-
 2 files changed, 82 insertions(+), 23 deletions(-)

diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
index c5ca97d036..cbaf92d800 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
@@ -29,9 +29,6 @@
 #include "neighbor_kokkos.h"
 #include "neigh_request.h"
 
-//#include "ace-evaluator/ace_c_basis.h"
-//#include "ace-evaluator/ace_evaluator.h"
-//#include "ace-evaluator/ace_recursive.h"
 #include "ace-evaluator/ace_version.h"
 #include "ace-evaluator/ace_radial.h"
 
@@ -132,7 +129,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::grow(int natom, int maxneigh)
 
     //B-projections
     MemKK::realloc_kokkos(projections, "pace:projections", natom, total_num_functions_max); // per-atom B-projections
-    MemKK::realloc_kokkos(gamma, "pace:gamma", natom); // per-atom gamma
+    MemKK::realloc_kokkos(d_gamma, "pace:gamma", natom); // per-atom gamma
   }
 
   if (((int)ylm.extent(0) < natom) || ((int)ylm.extent(1) < maxneigh)) {
@@ -321,7 +318,9 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
   MemKK::realloc_kokkos(d_gen_cgs, "pace:gen_cgs", nelements, idx_ms_combs_max);
   MemKK::realloc_kokkos(d_coeffs, "pace:coeffs", nelements, total_num_functions_max, basis_set->ndensitymax);
   // active set inverted
-  MemKK::realloc_kokkos(d_ASI, "pace:ASI", nelements, total_num_functions_max, total_num_functions_max);
+  t_ace_3d d_ASI_temp;
+  MemKK::realloc_kokkos(d_ASI_temp, "pace:ASI_temp", nelements, total_num_functions_max, total_num_functions_max);
+
 
 
   auto h_rank = Kokkos::create_mirror_view(d_rank);
@@ -331,11 +330,10 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
   auto h_ns = Kokkos::create_mirror_view(d_ns);
   auto h_ls = Kokkos::create_mirror_view(d_ls);
   auto h_ms_combs = Kokkos::create_mirror_view(d_ms_combs);
-//  auto h_ctildes = Kokkos::create_mirror_view(d_ctildes);
   auto h_gen_cgs = Kokkos::create_mirror_view(d_gen_cgs);
   auto h_coeffs = Kokkos::create_mirror_view(d_coeffs);
 // asi
-  auto h_ASI = Kokkos::create_mirror_view(d_ASI);
+  auto h_ASI = Kokkos::create_mirror_view(d_ASI_temp);
 
   // copy values on host
 
@@ -404,7 +402,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
     const auto &A_as_inv = b_evaluator->A_active_set_inv.at(mu);
     for(int i = 0; i < total_basis_size_rank1 + total_basis_size; i++)
         for(int j = 0; j < total_basis_size_rank1 + total_basis_size; j++){
-            h_ASI(mu,i,j)=A_as_inv(i,j);
+            h_ASI(mu,i,j)=A_as_inv(j,i); // transpose back for better performance on GPU
     }
 
   }
@@ -416,10 +414,10 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
   Kokkos::deep_copy(d_ns, h_ns);
   Kokkos::deep_copy(d_ls, h_ls);
   Kokkos::deep_copy(d_ms_combs, h_ms_combs);
-//  Kokkos::deep_copy(d_ctildes, h_ctildes);
   Kokkos::deep_copy(d_gen_cgs, h_gen_cgs);
   Kokkos::deep_copy(d_coeffs, h_coeffs);
-  Kokkos::deep_copy(d_ASI, h_ASI);
+  Kokkos::deep_copy(d_ASI_temp, h_ASI);
+  d_ASI = d_ASI_temp; // copy from temopary array to const array
 }
 
 /* ----------------------------------------------------------------------
@@ -573,6 +571,15 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
     memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
     d_vatom = k_vatom.view<DeviceType>();
   }
+
+  if (gamma_flag && atom->nlocal > nmax) {
+        memory->destroy(extrapolation_grade_gamma);
+        nmax = atom->nlocal;
+        memory->create(extrapolation_grade_gamma, nmax, "pace/atom:gamma");
+        //zeroify array
+        memset(extrapolation_grade_gamma, 0, nmax * sizeof(*extrapolation_grade_gamma));
+  }
+
   copymode = 1;
   if (!force->newton_pair)
     error->all(FLERR,"PairPACEExtrapolationKokkos requires 'newton on'");
@@ -609,7 +616,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
 
   chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user
   chunk_offset = 0;
-  gamma_flag = 1;
+
 
   grow(chunk_size, maxneigh);
 
@@ -624,7 +631,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
     Kokkos::deep_copy(rhos, 0.0);
 
     Kokkos::deep_copy(projections, 0.0);
-    Kokkos::deep_copy(gamma, 0.0);
+    Kokkos::deep_copy(d_gamma, 0.0);
 
     EV_FLOAT ev_tmp;
 
@@ -729,8 +736,15 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
       }
     }
     ev += ev_tmp;
-    chunk_offset += chunk_size;
 
+    //if gamma_flag - copy current d_gamma to extrapolation_grade_gamma
+    if(gamma_flag){
+        h_gamma = Kokkos::create_mirror_view(d_gamma);
+        Kokkos:deep_copy(h_gamma, d_gamma);
+        memcpy(extrapolation_grade_gamma+chunk_offset, (void *) h_gamma.data(), sizeof(double)*chunk_size);
+    }
+
+    chunk_offset += chunk_size;
   } // end while
 
   if (need_dup)
@@ -1075,19 +1089,21 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeGamm
 
 
     double gamma_max = 0;
-    for (int j = 0; j <basis_size; j++) {
-        double current_gamma = 0;
+    double current_gamma;
+    for (int j = 0; j <basis_size; j++) { //
+        current_gamma = 0;
 
         // compute row-matrix-multiplication asi_vector * A_as_inv (transposed matrix)
         for (int k = 0; k < basis_size; k++)
-            current_gamma += projections(ii,k) * d_ASI(mu_i, j, k); //correct d_ASI(mu_i, j, k)
+            current_gamma += projections(ii,k) * d_ASI(mu_i, k, j); //correct d_ASI(mu_i, j, k), but it is transposed during initialization
 
-        if (abs(current_gamma) > gamma_max)
-            gamma_max = abs(current_gamma);
+        current_gamma = fabs(current_gamma);
+        if (current_gamma > gamma_max)
+            gamma_max = current_gamma;
     }
 
     // tally energy contribution
-    gamma(ii) = gamma_max;
+    d_gamma(ii) = gamma_max;
 
 }
 /* ---------------------------------------------------------------------- */
@@ -1789,7 +1805,7 @@ double PairPACEExtrapolationKokkos<DeviceType>::memory_usage()
   bytes += MemKK::memory_usage(d_total_basis_size);
   bytes += MemKK::memory_usage(d_ASI);
   bytes += MemKK::memory_usage(projections);
-  bytes += MemKK::memory_usage(gamma);
+  bytes += MemKK::memory_usage(d_gamma);
 
   if (k_splines_gk.h_view.data()) {
     for (int i = 0; i < nelements; i++) {
@@ -1812,3 +1828,38 @@ template class PairPACEExtrapolationKokkos<LMPDeviceType>;
 template class PairPACEExtrapolationKokkos<LMPHostType>;
 #endif
 }
+
+/* ---------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------
+    extract method for extracting value of scale variable
+ ---------------------------------------------------------------------- */
+template<class DeviceType>
+void *PairPACEExtrapolationKokkos<DeviceType>::extract(const char *str, int &dim)
+{
+    //check if str=="gamma_flag" then compute extrapolation grades on this iteration
+    dim = 0;
+    if (strcmp(str, "gamma_flag") == 0) return (void *) &gamma_flag;
+
+    dim = 2;
+    if (strcmp(str, "scale") == 0) return (void *) scale;
+    return nullptr;
+}
+
+/* ----------------------------------------------------------------------
+   peratom requests from FixPair
+   return ptr to requested data
+   also return ncol = # of quantites per atom
+     0 = per-atom vector
+     1 or more = # of columns in per-atom array
+   return NULL if str is not recognized
+---------------------------------------------------------------------- */
+template<class DeviceType>
+void *PairPACEExtrapolationKokkos<DeviceType>::extract_peratom(const char *str, int &ncol)
+{
+    if (strcmp(str, "gamma") == 0) {
+        ncol = 0;
+        return (void *) extrapolation_grade_gamma;
+    }
+
+    return nullptr;
+}
\ No newline at end of file
diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
index a5b4f8abc7..6983fcc338 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.h
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
@@ -98,6 +98,10 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   KOKKOS_INLINE_FUNCTION
   void operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii, EV_FLOAT&) const;
 
+
+  void *extract(const char *str, int &dim);
+  void *extract_peratom(const char *str, int &ncol);
+
  protected:
   int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max, total_num_functions_max;
   int host_flag;
@@ -203,6 +207,7 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   typedef Kokkos::View<double**, DeviceType> t_ace_2d;
   typedef Kokkos::View<double*[3], DeviceType> t_ace_2d3;
   typedef Kokkos::View<double***, DeviceType> t_ace_3d;
+  typedef Kokkos::View<const double***, DeviceType> tc_ace_3d;
   typedef Kokkos::View<double**[3], DeviceType> t_ace_3d3;
   typedef Kokkos::View<double**[4], DeviceType> t_ace_3d4;
   typedef Kokkos::View<double****, DeviceType> t_ace_4d;
@@ -213,6 +218,8 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   typedef Kokkos::View<complex****, DeviceType> t_ace_4c;
   typedef Kokkos::View<complex***[3], DeviceType> t_ace_4c3;
 
+  typedef Kokkos::View<double*>::HostMirror th_ace_1d;
+
   t_ace_3d A_rank1;
   t_ace_4c A;
 
@@ -242,11 +249,12 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
   t_ace_3d d_derivatives;
 
   // inverted active set
-  t_ace_3d d_ASI;
+  tc_ace_3d d_ASI;
   t_ace_2d projections;
-  t_ace_1d gamma;
-  // Spherical Harmonics
+  t_ace_1d d_gamma;
+  th_ace_1d h_gamma;
 
+  // Spherical Harmonics
   void pre_compute_harmonics(int);
 
   KOKKOS_INLINE_FUNCTION

From e033cebcdd8274299969f1e4e172c232db9ce7b8 Mon Sep 17 00:00:00 2001
From: Yury Lysogorskiy <yury.lysogorskiy@rub.de>
Date: Tue, 3 Jan 2023 14:30:04 +0100
Subject: [PATCH 05/12] update lammps-user-pace version and checksum

---
 cmake/Modules/Packages/ML-PACE.cmake | 4 ++--
 lib/pace/Install.py                  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cmake/Modules/Packages/ML-PACE.cmake b/cmake/Modules/Packages/ML-PACE.cmake
index 0159f36c34..b64584e5a7 100644
--- a/cmake/Modules/Packages/ML-PACE.cmake
+++ b/cmake/Modules/Packages/ML-PACE.cmake
@@ -1,6 +1,6 @@
-set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2022.10.15.tar.gz" CACHE STRING "URL for PACE evaluator library sources")
+set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2023.01.3.tar.gz" CACHE STRING "URL for PACE evaluator library sources")
 
-set(PACELIB_MD5 "848ad6a6cc79fa82745927001fb1c9b5" CACHE STRING "MD5 checksum of PACE evaluator library tarball")
+set(PACELIB_MD5 "f418d32b60e531063ac4285bf702b468" CACHE STRING "MD5 checksum of PACE evaluator library tarball")
 mark_as_advanced(PACELIB_URL)
 mark_as_advanced(PACELIB_MD5)
 
diff --git a/lib/pace/Install.py b/lib/pace/Install.py
index 9f132a9580..f7f50d4f6d 100644
--- a/lib/pace/Install.py
+++ b/lib/pace/Install.py
@@ -18,11 +18,11 @@ from install_helpers import fullpath, geturl, checkmd5sum
 # settings
 
 thisdir = fullpath('.')
-version ='v.2022.10.15'
+version ='v.2023.01.3'
 
 # known checksums for different PACE versions. used to validate the download.
 checksums = { \
-    'v.2022.10.15': '848ad6a6cc79fa82745927001fb1c9b5'
+    'v.2023.01.3': 'f418d32b60e531063ac4285bf702b468'
 }
 
 parser = ArgumentParser(prog='Install.py', description="LAMMPS library build wrapper script")

From 4d50109731a83f8c1a69ba81f0173f72f26a1364 Mon Sep 17 00:00:00 2001
From: Yury Lysogorskiy <yury.lysogorskiy@rub.de>
Date: Tue, 3 Jan 2023 14:43:46 +0100
Subject: [PATCH 06/12] add check that LINEAR ASI must be used

---
 src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
index cbaf92d800..ba5ecafeac 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
@@ -429,7 +429,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::init_style()
 {
   if (host_flag) {
     if (lmp->kokkos->nthreads > 1)
-      error->all(FLERR,"Pair style pace/kk can currently only run on a single "
+      error->all(FLERR,"Pair style pace/extrapolation/kk can currently only run on a single "
                          "CPU thread");
 
     PairPACEExtrapolation::init_style();
@@ -497,6 +497,10 @@ void PairPACEExtrapolationKokkos<DeviceType>::coeff(int narg, char **arg)
 {
   PairPACEExtrapolation::coeff(narg,arg);
 
+  auto b_evaluator = aceimpl->ace;
+  if (!b_evaluator->get_is_linear_extrapolation_grade()) {
+        error->all(FLERR,"Must use LINEAR ASI with pair pace/extrapolation/kk");
+  }
   // Set up element lists
 
   auto h_map = Kokkos::create_mirror_view(d_map);

From cfbc2d88944ffa03852e81f5176d11cde7622088 Mon Sep 17 00:00:00 2001
From: Yury Lysogorskiy <yury.lysogorskiy@rub.de>
Date: Tue, 3 Jan 2023 14:48:25 +0100
Subject: [PATCH 07/12] update pair_pace.rst

---
 doc/src/pair_pace.rst | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/doc/src/pair_pace.rst b/doc/src/pair_pace.rst
index 697a9965b6..01f7df63f6 100644
--- a/doc/src/pair_pace.rst
+++ b/doc/src/pair_pace.rst
@@ -1,6 +1,7 @@
 .. index:: pair_style pace
 .. index:: pair_style pace/kk
 .. index:: pair_style pace/extrapolation
+.. index:: pair_style pace/extrapolation/kk
 
 pair_style pace command
 =======================
@@ -127,6 +128,9 @@ but not more often than every 20 steps.
 
 On all other steps `pair_style pace recursive` will be used.
 
+When using the pair style *pace/extrapolation* with the KOKKOS package on GPUs
+product B-basis evaluator is always used and only *linear* ASI is supported.
+
 ----------
 
 See the :doc:`pair_coeff <pair_coeff>` page for alternate ways
@@ -186,4 +190,4 @@ recursive, chunksize = 4096,
 
 .. _Lysogorskiy2022:
 
-**(Lysogorskiy2022)** Lysogorskiy, Bochkarev, Mrovec, Drautz, TBS (2022).
+**(Lysogorskiy2022)** Lysogorskiy, Bochkarev, Mrovec, Drautz, arXiv:2212.08716 (2022).

From 295d8a69035497ea0b575b6764c349cd82a9af5c Mon Sep 17 00:00:00 2001
From: Yury Lysogorskiy <yury.lysogorskiy@rub.de>
Date: Tue, 3 Jan 2023 15:58:34 +0100
Subject: [PATCH 08/12] fix_pair.cpp: respect lmp->suffix when looking for
 pair_style name match

---
 src/fix_pair.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/fix_pair.cpp b/src/fix_pair.cpp
index 5e5ccbf810..bd389f6632 100644
--- a/src/fix_pair.cpp
+++ b/src/fix_pair.cpp
@@ -36,6 +36,10 @@ FixPair::FixPair(LAMMPS *lmp, int narg, char **arg) :
   if (nevery < 1) error->all(FLERR,"Illegal fix pair every value: {}", nevery);
 
   pairname = utils::strdup(arg[4]);
+  if(lmp->suffix) {
+      strcat(pairname,"/");
+      strcat(pairname,lmp->suffix);
+  }
   pstyle = force->pair_match(pairname,1,0);
   if (pstyle == nullptr) error->all(FLERR,"Pair style {} for fix pair not found", pairname);
 

From a3572b61d84d17c5efec076663c8607aa1325c13 Mon Sep 17 00:00:00 2001
From: Stan Gerald Moore <stamoor@sandia.gov>
Date: Tue, 3 Jan 2023 10:09:47 -0700
Subject: [PATCH 09/12] Small cleanup

---
 src/KOKKOS/Install.sh                         |   2 +
 src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 129 +++++++++---------
 src/ML-PACE/pair_pace_extrapolation.cpp       |   4 -
 3 files changed, 64 insertions(+), 71 deletions(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index a5bf6437fc..77a9932d90 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -320,6 +320,8 @@ action pair_multi_lucy_rx_kokkos.cpp pair_multi_lucy_rx.cpp
 action pair_multi_lucy_rx_kokkos.h pair_multi_lucy_rx.h
 action pair_pace_kokkos.cpp pair_pace.cpp
 action pair_pace_kokkos.h pair_pace.h
+action pair_pace_extrapolation_kokkos.cpp pair_pace_extrapolation.cpp
+action pair_pace_extrapolation_kokkos.h pair_pace_extrapolation.h
 action pair_reaxff_kokkos.cpp pair_reaxff.cpp
 action pair_reaxff_kokkos.h pair_reaxff.h
 action pair_snap_kokkos.cpp pair_snap.cpp
diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
index ba5ecafeac..6369c7fba5 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
@@ -38,19 +38,18 @@
 #include <cstring>
 
 namespace LAMMPS_NS {
-    struct ACEALImpl {
-        ACEALImpl() : basis_set(nullptr), ace(nullptr) {}
+  struct ACEALImpl {
+    ACEALImpl() : basis_set(nullptr), ace(nullptr) {}
 
-        ~ACEALImpl()
-        {
-            delete basis_set;
-            delete ace;
-        }
+    ~ACEALImpl() {
+      delete basis_set;
+      delete ace;
+    }
 
-        ACEBBasisSet *basis_set;
-        ACEBEvaluator *ace;
-    };
-}    // namespace LAMMPS_NS
+    ACEBBasisSet *basis_set;
+    ACEBEvaluator *ace;
+  };
+} // namespace LAMMPS_NS
 
 using namespace LAMMPS_NS;
 using namespace MathConst;
@@ -187,7 +186,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_pertype()
     h_rho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).rho_core_cutoff;
     h_drho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).drho_core_cutoff;
 
-    h_E0vals(n)= basis_set->E0vals(n);
+    h_E0vals(n) = basis_set->E0vals(n);
 
     h_ndensity(n) = basis_set->map_embedding_specifications.at(n).ndensity;
 
@@ -321,8 +320,6 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
   t_ace_3d d_ASI_temp;
   MemKK::realloc_kokkos(d_ASI_temp, "pace:ASI_temp", nelements, total_num_functions_max, total_num_functions_max);
 
-
-
   auto h_rank = Kokkos::create_mirror_view(d_rank);
   auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs);
   auto h_func_inds = Kokkos::create_mirror_view(d_func_inds);
@@ -332,7 +329,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
   auto h_ms_combs = Kokkos::create_mirror_view(d_ms_combs);
   auto h_gen_cgs = Kokkos::create_mirror_view(d_gen_cgs);
   auto h_coeffs = Kokkos::create_mirror_view(d_coeffs);
-// asi
+  // asi
   auto h_ASI = Kokkos::create_mirror_view(d_ASI_temp);
 
   // copy values on host
@@ -380,7 +377,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
       }
 
       for (int p = 0; p < ndensity; ++p)
-            h_coeffs(mu, func_ind_through, p) = func->coeff[p];
+        h_coeffs(mu, func_ind_through, p) = func->coeff[p];
 
 
       // loop over {ms} combinations in sum
@@ -400,11 +397,10 @@ void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
 
     // ASI
     const auto &A_as_inv = b_evaluator->A_active_set_inv.at(mu);
-    for(int i = 0; i < total_basis_size_rank1 + total_basis_size; i++)
-        for(int j = 0; j < total_basis_size_rank1 + total_basis_size; j++){
-            h_ASI(mu,i,j)=A_as_inv(j,i); // transpose back for better performance on GPU
+    for (int i = 0; i < total_basis_size_rank1 + total_basis_size; i++)
+      for (int j = 0; j < total_basis_size_rank1 + total_basis_size; j++){
+        h_ASI(mu,i,j) = A_as_inv(j,i); // transpose back for better performance on GPU
     }
-
   }
 
   Kokkos::deep_copy(d_rank, h_rank);
@@ -565,6 +561,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
   ev_init(eflag,vflag,0);
 
   // reallocate per-atom arrays if necessary
+
   if (eflag_atom) {
     memoryKK->destroy_kokkos(k_eatom,eatom);
     memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
@@ -742,7 +739,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in
     ev += ev_tmp;
 
     //if gamma_flag - copy current d_gamma to extrapolation_grade_gamma
-    if(gamma_flag){
+    if (gamma_flag){
         h_gamma = Kokkos::create_mirror_view(d_gamma);
         Kokkos:deep_copy(h_gamma, d_gamma);
         memcpy(extrapolation_grade_gamma+chunk_offset, (void *) h_gamma.data(), sizeof(double)*chunk_size);
@@ -1001,7 +998,7 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeRho,
 
 
     //gamma_i
-    if(gamma_flag)
+    if (gamma_flag)
         Kokkos::atomic_add(&projections(ii, func_ind),  d_gen_cgs(mu_i, idx_ms_comb) * A_cur);
 
   } else { // rank > 1
@@ -1040,14 +1037,13 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeRho,
       Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb)));
     }
     //gamma_i
-    if(gamma_flag)
-        Kokkos::atomic_add(&projections(ii, func_ind),  B.real_part_product(d_gen_cgs(mu_i, idx_ms_comb)));
+    if (gamma_flag)
+      Kokkos::atomic_add(&projections(ii, func_ind),  B.real_part_product(d_gen_cgs(mu_i, idx_ms_comb)));
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeFS, const int& ii) const
@@ -1079,37 +1075,34 @@ void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeFS,
   }
 }
 
-
 /* ---------------------------------------------------------------------- */
 
-
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeGamma, const int& ii) const
 {
-    const int i = d_ilist[ii + chunk_offset];
-    const int mu_i = d_map(type(i));
-    const int basis_size = d_total_basis_size(mu_i);
+  const int i = d_ilist[ii + chunk_offset];
+  const int mu_i = d_map(type(i));
+  const int basis_size = d_total_basis_size(mu_i);
 
+  double gamma_max = 0;
+  double current_gamma;
+  for (int j = 0; j <basis_size; j++) {
+    current_gamma = 0;
 
-    double gamma_max = 0;
-    double current_gamma;
-    for (int j = 0; j <basis_size; j++) { //
-        current_gamma = 0;
+    // compute row-matrix-multiplication asi_vector * A_as_inv (transposed matrix)
+    for (int k = 0; k < basis_size; k++)
+      current_gamma += projections(ii,k) * d_ASI(mu_i, k, j); //correct d_ASI(mu_i, j, k), but it is transposed during initialization
 
-        // compute row-matrix-multiplication asi_vector * A_as_inv (transposed matrix)
-        for (int k = 0; k < basis_size; k++)
-            current_gamma += projections(ii,k) * d_ASI(mu_i, k, j); //correct d_ASI(mu_i, j, k), but it is transposed during initialization
-
-        current_gamma = fabs(current_gamma);
-        if (current_gamma > gamma_max)
-            gamma_max = current_gamma;
-    }
-
-    // tally energy contribution
-    d_gamma(ii) = gamma_max;
+    current_gamma = fabs(current_gamma);
+    if (current_gamma > gamma_max)
+      gamma_max = current_gamma;
+  }
 
+  // tally energy contribution
+  d_gamma(ii) = gamma_max;
 }
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
@@ -1824,29 +1817,20 @@ double PairPACEExtrapolationKokkos<DeviceType>::memory_usage()
   return bytes;
 }
 
-/* ---------------------------------------------------------------------- */
-
-namespace LAMMPS_NS {
-template class PairPACEExtrapolationKokkos<LMPDeviceType>;
-#ifdef LMP_KOKKOS_GPU
-template class PairPACEExtrapolationKokkos<LMPHostType>;
-#endif
-}
-
-/* ---------------------------------------------------------------------- */
 /* ----------------------------------------------------------------------
     extract method for extracting value of scale variable
  ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 void *PairPACEExtrapolationKokkos<DeviceType>::extract(const char *str, int &dim)
 {
-    //check if str=="gamma_flag" then compute extrapolation grades on this iteration
-    dim = 0;
-    if (strcmp(str, "gamma_flag") == 0) return (void *) &gamma_flag;
+  //check if str=="gamma_flag" then compute extrapolation grades on this iteration
+  dim = 0;
+  if (strcmp(str, "gamma_flag") == 0) return (void *) &gamma_flag;
 
-    dim = 2;
-    if (strcmp(str, "scale") == 0) return (void *) scale;
-    return nullptr;
+  dim = 2;
+  if (strcmp(str, "scale") == 0) return (void *) scale;
+  return nullptr;
 }
 
 /* ----------------------------------------------------------------------
@@ -1857,13 +1841,24 @@ void *PairPACEExtrapolationKokkos<DeviceType>::extract(const char *str, int &dim
      1 or more = # of columns in per-atom array
    return NULL if str is not recognized
 ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 void *PairPACEExtrapolationKokkos<DeviceType>::extract_peratom(const char *str, int &ncol)
 {
-    if (strcmp(str, "gamma") == 0) {
-        ncol = 0;
-        return (void *) extrapolation_grade_gamma;
-    }
+  if (strcmp(str, "gamma") == 0) {
+    ncol = 0;
+    return (void *) extrapolation_grade_gamma;
+  }
+
+  return nullptr;
+}
+
+/* ---------------------------------------------------------------------- */
+
+namespace LAMMPS_NS {
+template class PairPACEExtrapolationKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class PairPACEExtrapolationKokkos<LMPHostType>;
+#endif
+}
 
-    return nullptr;
-}
\ No newline at end of file
diff --git a/src/ML-PACE/pair_pace_extrapolation.cpp b/src/ML-PACE/pair_pace_extrapolation.cpp
index 2c5e5fafe9..8a0116526a 100644
--- a/src/ML-PACE/pair_pace_extrapolation.cpp
+++ b/src/ML-PACE/pair_pace_extrapolation.cpp
@@ -135,16 +135,12 @@ void PairPACEExtrapolation::compute(int eflag, int vflag)
 
   double **x = atom->x;
   double **f = atom->f;
-//  tagint *tag = atom->tag;
   int *type = atom->type;
   // number of atoms in cell
   int nlocal = atom->nlocal;
 
   int newton_pair = force->newton_pair;
 
-  // number of atoms including ghost atoms
-//  int nall = nlocal + atom->nghost;
-
   // inum: length of the neighborlists list
   inum = list->inum;
 

From bcb5285ef9fca4211295d45bd900448b4064ec01 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 3 Jan 2023 14:15:12 -0500
Subject: [PATCH 10/12] update / correct suffix handling in fix pair

---
 doc/src/fix_pair.rst |  9 +++++++--
 src/fix_pair.cpp     | 17 +++++++++++++----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/doc/src/fix_pair.rst b/doc/src/fix_pair.rst
index abb44718cd..44c91f18ee 100644
--- a/doc/src/fix_pair.rst
+++ b/doc/src/fix_pair.rst
@@ -1,7 +1,7 @@
 .. index:: fix pair
 
 fix pair command
-=======================
+================
 
 Syntax
 """"""
@@ -47,7 +47,12 @@ These are example use cases:
 The *N* argument determines how often the fix is invoked.
 
 The *pstyle* argument is the name of the pair style.  It can be a
-sub-style used in a :doc:`pair_style hybrid <pair_hybrid>` command.
+sub-style used in a :doc:`pair_style hybrid <pair_hybrid>` command.  If
+there are multiple sub-styles using the same pair style, then *pstyle*
+should be specified as "style:N", where *N* is the number of the
+instance of the pair style you wish monitor (e.g., the first or second).
+For example, *pstyle* could be specified as "pace/extrapolation" or
+"amoeba" or "eam:1" or "eam:2".
 
 One or more *name/flag* pairs of arguments follow.  Each *name* is a
 per-atom quantity which the pair style must recognize as an extraction
diff --git a/src/fix_pair.cpp b/src/fix_pair.cpp
index bd389f6632..db7b24dfea 100644
--- a/src/fix_pair.cpp
+++ b/src/fix_pair.cpp
@@ -36,11 +36,20 @@ FixPair::FixPair(LAMMPS *lmp, int narg, char **arg) :
   if (nevery < 1) error->all(FLERR,"Illegal fix pair every value: {}", nevery);
 
   pairname = utils::strdup(arg[4]);
-  if(lmp->suffix) {
-      strcat(pairname,"/");
-      strcat(pairname,lmp->suffix);
+  char *cptr;
+  int nsub = 0;
+  if ((cptr = strchr(pairname,':'))) {
+    *cptr = '\0';
+    nsub = utils::inumeric(FLERR,cptr+1,false,lmp);
   }
-  pstyle = force->pair_match(pairname,1,0);
+
+  if (lmp->suffix_enable) {
+    if (lmp->suffix)
+      pstyle = force->pair_match(fmt::format("{}/{}",pairname,lmp->suffix),1,nsub);
+    if ((pstyle == nullptr) && lmp->suffix2)
+      pstyle = force->pair_match(fmt::format("{}/{}",pairname,lmp->suffix2),1,nsub);
+  }
+  if (pstyle == nullptr) pstyle = force->pair_match(pairname,1,nsub);
   if (pstyle == nullptr) error->all(FLERR,"Pair style {} for fix pair not found", pairname);
 
   nfield = (narg-5) / 2;

From 87a8cfe299f3e194bc6b24d19660a0ed69cb9ca8 Mon Sep 17 00:00:00 2001
From: Yury Lysogorskiy <yury.lysogorskiy@rub.de>
Date: Wed, 4 Jan 2023 12:46:50 +0100
Subject: [PATCH 11/12] - replace #include "ace-evaluator/ace_radial.h" in
 pair_pace_kokkos.h and pair_pace_extrapolation_kokkos.h with forward
 declaration "class SplineInterpolator;" - move
 SplineInterpolatorKokkos::operator=(const SplineInterpolator &spline) to .cpp
 files

---
 src/KOKKOS/pair_pace_extrapolation_kokkos.cpp | 18 ++++++++++++++++
 src/KOKKOS/pair_pace_extrapolation_kokkos.h   | 21 +++----------------
 src/KOKKOS/pair_pace_kokkos.cpp               | 18 ++++++++++++++++
 src/KOKKOS/pair_pace_kokkos.h                 | 21 +++----------------
 4 files changed, 42 insertions(+), 36 deletions(-)

diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
index 6369c7fba5..e6ae694cad 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
@@ -1672,7 +1672,25 @@ void PairPACEExtrapolationKokkos<DeviceType>::evaluate_splines(const int ii, con
 }
 
 /* ---------------------------------------------------------------------- */
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::SplineInterpolatorKokkos::operator=(const SplineInterpolator &spline) {
+    cutoff = spline.cutoff;
+    deltaSplineBins = spline.deltaSplineBins;
+    ntot = spline.ntot;
+    nlut = spline.nlut;
+    invrscalelookup = spline.invrscalelookup;
+    rscalelookup = spline.rscalelookup;
+    num_of_functions = spline.num_of_functions;
 
+    lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions);
+    auto h_lookupTable = Kokkos::create_mirror_view(lookupTable);
+    for (int i = 0; i < ntot+1; i++)
+        for (int j = 0; j < num_of_functions; j++)
+            for (int k = 0; k < 4; k++)
+                h_lookupTable(i, j, k) = spline.lookupTable(i, j, k);
+    Kokkos::deep_copy(lookupTable, h_lookupTable);
+}
+/* ---------------------------------------------------------------------- */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEExtrapolationKokkos<DeviceType>::SplineInterpolatorKokkos::calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const
diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
index 6983fcc338..a65841f8f6 100644
--- a/src/KOKKOS/pair_pace_extrapolation_kokkos.h
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
@@ -24,10 +24,11 @@ PairStyle(pace/extrapolation/kk/host,PairPACEExtrapolationKokkos<LMPHostType>);
 #define LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H
 
 #include "pair_pace_extrapolation.h"
-#include "ace-evaluator/ace_radial.h"
 #include "kokkos_type.h"
 #include "pair_kokkos.h"
 
+class SplineInterpolator;
+
 namespace LAMMPS_NS {
 
 template<class DeviceType>
@@ -313,23 +314,7 @@ class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
 
     t_ace_3d4 lookupTable;
 
-    void operator=(const SplineInterpolator &spline) {
-      cutoff = spline.cutoff;
-      deltaSplineBins = spline.deltaSplineBins;
-      ntot = spline.ntot;
-      nlut = spline.nlut;
-      invrscalelookup = spline.invrscalelookup;
-      rscalelookup = spline.rscalelookup;
-      num_of_functions = spline.num_of_functions;
-
-      lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions);
-      auto h_lookupTable = Kokkos::create_mirror_view(lookupTable);
-      for (int i = 0; i < ntot+1; i++)
-        for (int j = 0; j < num_of_functions; j++)
-          for (int k = 0; k < 4; k++)
-            h_lookupTable(i, j, k) = spline.lookupTable(i, j, k);
-      Kokkos::deep_copy(lookupTable, h_lookupTable);
-    }
+    void operator=(const SplineInterpolator &spline);
 
     void deallocate() {
       lookupTable = t_ace_3d4();
diff --git a/src/KOKKOS/pair_pace_kokkos.cpp b/src/KOKKOS/pair_pace_kokkos.cpp
index dd5b97fe32..6f1e3feaf8 100644
--- a/src/KOKKOS/pair_pace_kokkos.cpp
+++ b/src/KOKKOS/pair_pace_kokkos.cpp
@@ -1574,7 +1574,25 @@ void PairPACEKokkos<DeviceType>::evaluate_splines(const int ii, const int jj, do
 }
 
 /* ---------------------------------------------------------------------- */
+template<class DeviceType>
+void PairPACEKokkos<DeviceType>::SplineInterpolatorKokkos::operator=(const SplineInterpolator &spline) {
+    cutoff = spline.cutoff;
+    deltaSplineBins = spline.deltaSplineBins;
+    ntot = spline.ntot;
+    nlut = spline.nlut;
+    invrscalelookup = spline.invrscalelookup;
+    rscalelookup = spline.rscalelookup;
+    num_of_functions = spline.num_of_functions;
 
+    lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions);
+    auto h_lookupTable = Kokkos::create_mirror_view(lookupTable);
+    for (int i = 0; i < ntot+1; i++)
+        for (int j = 0; j < num_of_functions; j++)
+            for (int k = 0; k < 4; k++)
+                h_lookupTable(i, j, k) = spline.lookupTable(i, j, k);
+    Kokkos::deep_copy(lookupTable, h_lookupTable);
+}
+/* ---------------------------------------------------------------------- */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEKokkos<DeviceType>::SplineInterpolatorKokkos::calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const
diff --git a/src/KOKKOS/pair_pace_kokkos.h b/src/KOKKOS/pair_pace_kokkos.h
index cc2a3cb674..39cfd100f8 100644
--- a/src/KOKKOS/pair_pace_kokkos.h
+++ b/src/KOKKOS/pair_pace_kokkos.h
@@ -24,10 +24,11 @@ PairStyle(pace/kk/host,PairPACEKokkos<LMPHostType>);
 #define LMP_PAIR_PACE_KOKKOS_H
 
 #include "pair_pace.h"
-#include "ace-evaluator/ace_radial.h"
 #include "kokkos_type.h"
 #include "pair_kokkos.h"
 
+class SplineInterpolator;
+
 namespace LAMMPS_NS {
 
 template<class DeviceType>
@@ -293,23 +294,7 @@ class PairPACEKokkos : public PairPACE {
 
     t_ace_3d4 lookupTable;
 
-    void operator=(const SplineInterpolator &spline) {
-      cutoff = spline.cutoff;
-      deltaSplineBins = spline.deltaSplineBins;
-      ntot = spline.ntot;
-      nlut = spline.nlut;
-      invrscalelookup = spline.invrscalelookup;
-      rscalelookup = spline.rscalelookup;
-      num_of_functions = spline.num_of_functions;
-
-      lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions);
-      auto h_lookupTable = Kokkos::create_mirror_view(lookupTable);
-      for (int i = 0; i < ntot+1; i++)
-        for (int j = 0; j < num_of_functions; j++)
-          for (int k = 0; k < 4; k++)
-            h_lookupTable(i, j, k) = spline.lookupTable(i, j, k);
-      Kokkos::deep_copy(lookupTable, h_lookupTable);
-    }
+    void operator=(const SplineInterpolator &spline);
 
     void deallocate() {
       lookupTable = t_ace_3d4();

From 20b6355888d3b005601b62439270cd749dae24e9 Mon Sep 17 00:00:00 2001
From: Yury Lysogorskiy <yury.lysogorskiy@rub.de>
Date: Wed, 4 Jan 2023 15:00:03 +0100
Subject: [PATCH 12/12] refactor fix_pair.h/cpp: extract method "query_pstyle"
 and call it also in void FixPair::init()

---
 src/fix_pair.cpp | 40 +++++++++++++++++++++++++---------------
 src/fix_pair.h   |  2 ++
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/src/fix_pair.cpp b/src/fix_pair.cpp
index db7b24dfea..59f9dbda4a 100644
--- a/src/fix_pair.cpp
+++ b/src/fix_pair.cpp
@@ -21,6 +21,7 @@
 #include "memory.h"
 #include "pair.h"
 #include "update.h"
+#include "fmt/format.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
@@ -36,20 +37,7 @@ FixPair::FixPair(LAMMPS *lmp, int narg, char **arg) :
   if (nevery < 1) error->all(FLERR,"Illegal fix pair every value: {}", nevery);
 
   pairname = utils::strdup(arg[4]);
-  char *cptr;
-  int nsub = 0;
-  if ((cptr = strchr(pairname,':'))) {
-    *cptr = '\0';
-    nsub = utils::inumeric(FLERR,cptr+1,false,lmp);
-  }
-
-  if (lmp->suffix_enable) {
-    if (lmp->suffix)
-      pstyle = force->pair_match(fmt::format("{}/{}",pairname,lmp->suffix),1,nsub);
-    if ((pstyle == nullptr) && lmp->suffix2)
-      pstyle = force->pair_match(fmt::format("{}/{}",pairname,lmp->suffix2),1,nsub);
-  }
-  if (pstyle == nullptr) pstyle = force->pair_match(pairname,1,nsub);
+  query_pstyle(lmp);
   if (pstyle == nullptr) error->all(FLERR,"Pair style {} for fix pair not found", pairname);
 
   nfield = (narg-5) / 2;
@@ -143,6 +131,28 @@ FixPair::FixPair(LAMMPS *lmp, int narg, char **arg) :
   lasttime = -1;
 }
 
+/* ---------------------------------------------------------------------- */
+
+void FixPair::query_pstyle(LAMMPS *lmp) {
+    char *cptr=nullptr;
+    int nsub = 0;
+    if ((cptr = strchr(pairname, ':'))) {
+        *cptr = '\0';
+        nsub = utils::inumeric(FLERR,cptr+1,false,lmp);
+    }
+    pstyle = nullptr;
+    if (lmp->suffix_enable) {
+        if (lmp->suffix) {
+            pstyle = force->pair_match(fmt::format("{}/{}", pairname, lmp->suffix), 1, nsub);
+            if (pstyle == nullptr && (lmp->suffix2)) {
+                pstyle = force->pair_match(fmt::format("{}/{}", pairname, lmp->suffix2), 1, nsub);
+            }
+        }
+    }
+    if (pstyle == nullptr) pstyle = force->pair_match(pairname, 1, nsub);
+}
+
+
 /* ---------------------------------------------------------------------- */
 
 FixPair::~FixPair()
@@ -184,7 +194,7 @@ void FixPair::init()
 {
   // insure pair style still exists
 
-  pstyle = force->pair_match(pairname,1,0);
+  query_pstyle(lmp);
   if (pstyle == nullptr) error->all(FLERR,"Pair style {} for fix pair not found", pairname);
 }
 
diff --git a/src/fix_pair.h b/src/fix_pair.h
index 765b261ee5..57fbdf8ecb 100644
--- a/src/fix_pair.h
+++ b/src/fix_pair.h
@@ -56,6 +56,8 @@ class FixPair : public Fix {
   class Pair *pstyle;
   double *vector;
   double **array;
+
+  void query_pstyle(LAMMPS *lmp);
 };
 
 }    // namespace LAMMPS_NS