diff --git a/cmake/Modules/Packages/ML-PACE.cmake b/cmake/Modules/Packages/ML-PACE.cmake
index 0159f36c34..b64584e5a7 100644
--- a/cmake/Modules/Packages/ML-PACE.cmake
+++ b/cmake/Modules/Packages/ML-PACE.cmake
@@ -1,6 +1,6 @@
-set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2022.10.15.tar.gz" CACHE STRING "URL for PACE evaluator library sources")
+set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2023.01.3.tar.gz" CACHE STRING "URL for PACE evaluator library sources")
 
-set(PACELIB_MD5 "848ad6a6cc79fa82745927001fb1c9b5" CACHE STRING "MD5 checksum of PACE evaluator library tarball")
+set(PACELIB_MD5 "f418d32b60e531063ac4285bf702b468" CACHE STRING "MD5 checksum of PACE evaluator library tarball")
 mark_as_advanced(PACELIB_URL)
 mark_as_advanced(PACELIB_MD5)
 
diff --git a/doc/src/fix_pair.rst b/doc/src/fix_pair.rst
index abb44718cd..44c91f18ee 100644
--- a/doc/src/fix_pair.rst
+++ b/doc/src/fix_pair.rst
@@ -1,7 +1,7 @@
 .. index:: fix pair
 
 fix pair command
-=======================
+================
 
 Syntax
 """"""
@@ -47,7 +47,12 @@ These are example use cases:
 The *N* argument determines how often the fix is invoked.
 
 The *pstyle* argument is the name of the pair style.  It can be a
-sub-style used in a :doc:`pair_style hybrid <pair_hybrid>` command.
+sub-style used in a :doc:`pair_style hybrid <pair_hybrid>` command.  If
+there are multiple sub-styles using the same pair style, then *pstyle*
+should be specified as "style:N", where *N* is the number of the
+instance of the pair style you wish monitor (e.g., the first or second).
+For example, *pstyle* could be specified as "pace/extrapolation" or
+"amoeba" or "eam:1" or "eam:2".
 
 One or more *name/flag* pairs of arguments follow.  Each *name* is a
 per-atom quantity which the pair style must recognize as an extraction
diff --git a/doc/src/pair_pace.rst b/doc/src/pair_pace.rst
index 697a9965b6..01f7df63f6 100644
--- a/doc/src/pair_pace.rst
+++ b/doc/src/pair_pace.rst
@@ -1,6 +1,7 @@
 .. index:: pair_style pace
 .. index:: pair_style pace/kk
 .. index:: pair_style pace/extrapolation
+.. index:: pair_style pace/extrapolation/kk
 
 pair_style pace command
 =======================
@@ -127,6 +128,9 @@ but not more often than every 20 steps.
 
 On all other steps `pair_style pace recursive` will be used.
 
+When using the pair style *pace/extrapolation* with the KOKKOS package on GPUs
+product B-basis evaluator is always used and only *linear* ASI is supported.
+
 ----------
 
 See the :doc:`pair_coeff <pair_coeff>` page for alternate ways
@@ -186,4 +190,4 @@ recursive, chunksize = 4096,
 
 .. _Lysogorskiy2022:
 
-**(Lysogorskiy2022)** Lysogorskiy, Bochkarev, Mrovec, Drautz, TBS (2022).
+**(Lysogorskiy2022)** Lysogorskiy, Bochkarev, Mrovec, Drautz, arXiv:2212.08716 (2022).
diff --git a/lib/pace/Install.py b/lib/pace/Install.py
index 9f132a9580..f7f50d4f6d 100644
--- a/lib/pace/Install.py
+++ b/lib/pace/Install.py
@@ -18,11 +18,11 @@ from install_helpers import fullpath, geturl, checkmd5sum
 # settings
 
 thisdir = fullpath('.')
-version ='v.2022.10.15'
+version ='v.2023.01.3'
 
 # known checksums for different PACE versions. used to validate the download.
 checksums = { \
-    'v.2022.10.15': '848ad6a6cc79fa82745927001fb1c9b5'
+    'v.2023.01.3': 'f418d32b60e531063ac4285bf702b468'
 }
 
 parser = ArgumentParser(prog='Install.py', description="LAMMPS library build wrapper script")
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index a5bf6437fc..77a9932d90 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -320,6 +320,8 @@ action pair_multi_lucy_rx_kokkos.cpp pair_multi_lucy_rx.cpp
 action pair_multi_lucy_rx_kokkos.h pair_multi_lucy_rx.h
 action pair_pace_kokkos.cpp pair_pace.cpp
 action pair_pace_kokkos.h pair_pace.h
+action pair_pace_extrapolation_kokkos.cpp pair_pace_extrapolation.cpp
+action pair_pace_extrapolation_kokkos.h pair_pace_extrapolation.h
 action pair_reaxff_kokkos.cpp pair_reaxff.cpp
 action pair_reaxff_kokkos.h pair_reaxff.h
 action pair_snap_kokkos.cpp pair_snap.cpp
diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
new file mode 100644
index 0000000000..e6ae694cad
--- /dev/null
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.cpp
@@ -0,0 +1,1882 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   aE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Yury Lysogorskiy (ICAMS)
+------------------------------------------------------------------------- */
+
+#include "pair_pace_extrapolation_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "math_const.h"
+#include "memory_kokkos.h"
+#include "neighbor_kokkos.h"
+#include "neigh_request.h"
+
+#include "ace-evaluator/ace_version.h"
+#include "ace-evaluator/ace_radial.h"
+
+#include "ace/ace_b_basis.h"
+#include "ace/ace_b_evaluator.h"
+
+#include <cstring>
+
+namespace LAMMPS_NS {
+  struct ACEALImpl {
+    ACEALImpl() : basis_set(nullptr), ace(nullptr) {}
+
+    ~ACEALImpl() {
+      delete basis_set;
+      delete ace;
+    }
+
+    ACEBBasisSet *basis_set;
+    ACEBEvaluator *ace;
+  };
+} // namespace LAMMPS_NS
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+enum{FS,FS_SHIFTEDSCALED};
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairPACEExtrapolationKokkos<DeviceType>::PairPACEExtrapolationKokkos(LAMMPS *lmp) : PairPACEExtrapolation(lmp)
+{
+  respa_enable = 0;
+
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  host_flag = (execution_space == Host);
+}
+
+/* ----------------------------------------------------------------------
+   check if allocated, since class can be destructed when incomplete
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairPACEExtrapolationKokkos<DeviceType>::~PairPACEExtrapolationKokkos()
+{
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_eatom,eatom);
+  memoryKK->destroy_kokkos(k_vatom,vatom);
+
+  // deallocate views of views in serial to prevent issues in Kokkos tools
+
+  if (k_splines_gk.h_view.data()) {
+    for (int i = 0; i < nelements; i++) {
+      for (int j = 0; j < nelements; j++) {
+        k_splines_gk.h_view(i, j).deallocate();
+        k_splines_rnl.h_view(i, j).deallocate();
+        k_splines_hc.h_view(i, j).deallocate();
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::grow(int natom, int maxneigh)
+{
+  auto basis_set = aceimpl->basis_set;
+
+  if ((int)A.extent(0) < natom) {
+
+    MemKK::realloc_kokkos(A, "pace:A", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(A_rank1, "pace:A_rank1", natom, nelements, nradbase);
+
+    MemKK::realloc_kokkos(A_list, "pace:A_list", natom, idx_ms_combs_max, basis_set->rankmax);
+    //size is +1 of max to avoid out-of-boundary array access in double-triangular scheme
+    MemKK::realloc_kokkos(A_forward_prod, "pace:A_forward_prod", natom, idx_ms_combs_max, basis_set->rankmax + 1);
+
+    MemKK::realloc_kokkos(e_atom, "pace:e_atom", natom);
+    MemKK::realloc_kokkos(rhos, "pace:rhos", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion
+    MemKK::realloc_kokkos(dF_drho, "pace:dF_drho", natom, basis_set->ndensitymax + 1); // +1 density for core repulsion
+
+    MemKK::realloc_kokkos(weights, "pace:weights", natom, nelements, nradmax + 1, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(weights_rank1, "pace:weights_rank1", natom, nelements, nradbase);
+
+    // hard-core repulsion
+    MemKK::realloc_kokkos(rho_core, "pace:rho_core", natom);
+    MemKK::realloc_kokkos(dF_drho_core, "pace:dF_drho_core", natom);
+
+    MemKK::realloc_kokkos(dB_flatten, "pace:dB_flatten", natom, idx_ms_combs_max, basis_set->rankmax);
+
+    //B-projections
+    MemKK::realloc_kokkos(projections, "pace:projections", natom, total_num_functions_max); // per-atom B-projections
+    MemKK::realloc_kokkos(d_gamma, "pace:gamma", natom); // per-atom gamma
+  }
+
+  if (((int)ylm.extent(0) < natom) || ((int)ylm.extent(1) < maxneigh)) {
+
+    // radial functions
+    MemKK::realloc_kokkos(fr, "pace:fr", natom, maxneigh, nradmax, lmax + 1);
+    MemKK::realloc_kokkos(dfr, "pace:dfr", natom, maxneigh, nradmax, lmax + 1);
+    MemKK::realloc_kokkos(gr, "pace:gr", natom, maxneigh, nradbase);
+    MemKK::realloc_kokkos(dgr, "pace:dgr", natom, maxneigh, nradbase);
+    const int max_num_functions = MAX(nradbase, nradmax*(lmax + 1));
+    MemKK::realloc_kokkos(d_values, "pace:d_values", natom, maxneigh, max_num_functions);
+    MemKK::realloc_kokkos(d_derivatives, "pace:d_derivatives", natom, maxneigh, max_num_functions);
+
+    // hard-core repulsion
+    MemKK::realloc_kokkos(cr, "pace:cr", natom, maxneigh);
+    MemKK::realloc_kokkos(dcr, "pace:dcr", natom, maxneigh);
+
+    // spherical harmonics
+    MemKK::realloc_kokkos(plm, "pace:plm", natom, maxneigh, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(dplm, "pace:dplm", natom, maxneigh, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(ylm, "pace:ylm", natom, maxneigh, (lmax + 1) * (lmax + 1));
+    MemKK::realloc_kokkos(dylm, "pace:dylm", natom, maxneigh, (lmax + 1) * (lmax + 1));
+
+    // short neigh list
+    MemKK::realloc_kokkos(d_ncount, "pace:ncount", natom);
+    MemKK::realloc_kokkos(d_mu, "pace:mu", natom, maxneigh);
+    MemKK::realloc_kokkos(d_rhats, "pace:rhats", natom, maxneigh);
+    MemKK::realloc_kokkos(d_rnorms, "pace:rnorms", natom, maxneigh);
+    MemKK::realloc_kokkos(d_nearest, "pace:nearest", natom, maxneigh);
+
+    MemKK::realloc_kokkos(f_ij, "pace:f_ij", natom, maxneigh);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::copy_pertype()
+{
+  auto basis_set = aceimpl->basis_set;
+
+  MemKK::realloc_kokkos(d_rho_core_cutoff, "pace:rho_core_cutoff", nelements);
+  MemKK::realloc_kokkos(d_drho_core_cutoff, "pace:drho_core_cutoff", nelements);
+  MemKK::realloc_kokkos(d_E0vals, "pace:E0vals", nelements);
+  MemKK::realloc_kokkos(d_ndensity, "pace:ndensity", nelements);
+  MemKK::realloc_kokkos(d_npoti, "pace:npoti", nelements);
+
+  auto h_rho_core_cutoff = Kokkos::create_mirror_view(d_rho_core_cutoff);
+  auto h_drho_core_cutoff = Kokkos::create_mirror_view(d_drho_core_cutoff);
+  auto h_E0vals = Kokkos::create_mirror_view(d_E0vals);
+  auto h_ndensity = Kokkos::create_mirror_view(d_ndensity);
+  auto h_npoti = Kokkos::create_mirror_view(d_npoti);
+
+  for (int n = 0; n < nelements; n++) {
+    h_rho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).rho_core_cutoff;
+    h_drho_core_cutoff[n] = basis_set->map_embedding_specifications.at(n).drho_core_cutoff;
+
+    h_E0vals(n) = basis_set->E0vals(n);
+
+    h_ndensity(n) = basis_set->map_embedding_specifications.at(n).ndensity;
+
+    string npoti = basis_set->map_embedding_specifications.at(n).npoti;
+    if (npoti == "FinnisSinclair")
+      h_npoti(n) = FS;
+    else if (npoti == "FinnisSinclairShiftedScaled")
+      h_npoti(n) = FS_SHIFTEDSCALED;
+  }
+
+  Kokkos::deep_copy(d_rho_core_cutoff, h_rho_core_cutoff);
+  Kokkos::deep_copy(d_drho_core_cutoff, h_drho_core_cutoff);
+  Kokkos::deep_copy(d_E0vals, h_E0vals);
+  Kokkos::deep_copy(d_ndensity, h_ndensity);
+  Kokkos::deep_copy(d_npoti, h_npoti);
+
+  MemKK::realloc_kokkos(d_wpre, "pace:wpre", nelements, basis_set->ndensitymax);
+  MemKK::realloc_kokkos(d_mexp, "pace:mexp", nelements, basis_set->ndensitymax);
+
+  auto h_wpre = Kokkos::create_mirror_view(d_wpre);
+  auto h_mexp = Kokkos::create_mirror_view(d_mexp);
+
+  for (int n = 0; n < nelements; n++) {
+    const int ndensity = basis_set->map_embedding_specifications.at(n).ndensity;
+    for (int p = 0; p < ndensity; p++) {
+      h_wpre(n, p) = basis_set->map_embedding_specifications.at(n).FS_parameters.at(p * 2 + 0);
+      h_mexp(n, p) = basis_set->map_embedding_specifications.at(n).FS_parameters.at(p * 2 + 1);
+    }
+  }
+
+  Kokkos::deep_copy(d_wpre, h_wpre);
+  Kokkos::deep_copy(d_mexp, h_mexp);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::copy_splines()
+{
+  auto basis_set = aceimpl->basis_set;
+
+  if (k_splines_gk.d_view.data()) {
+    for (int i = 0; i < nelements; i++) {
+      for (int j = 0; j < nelements; j++) {
+        k_splines_gk.h_view(i, j).deallocate();
+        k_splines_rnl.h_view(i, j).deallocate();
+        k_splines_hc.h_view(i, j).deallocate();
+      }
+    }
+  }
+
+  k_splines_gk = Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType>("pace:splines_gk", nelements, nelements);
+  k_splines_rnl = Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType>("pace:splines_rnl", nelements, nelements);
+  k_splines_hc = Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType>("pace:splines_hc", nelements, nelements);
+
+  ACERadialFunctions* radial_functions = dynamic_cast<ACERadialFunctions*>(basis_set->radial_functions);
+
+  for (int i = 0; i < nelements; i++) {
+    for (int j = 0; j < nelements; j++) {
+      k_splines_gk.h_view(i, j) = radial_functions->splines_gk(i, j);
+      k_splines_rnl.h_view(i, j) = radial_functions->splines_rnl(i, j);
+      k_splines_hc.h_view(i, j) = radial_functions->splines_hc(i, j);
+    }
+  }
+
+  k_splines_gk.modify_host();
+  k_splines_rnl.modify_host();
+  k_splines_hc.modify_host();
+
+  k_splines_gk.sync_device();
+  k_splines_rnl.sync_device();
+  k_splines_hc.sync_device();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::copy_tilde()
+{
+  auto basis_set = aceimpl->basis_set;
+  auto b_evaluator = aceimpl->ace;
+
+  // flatten loops, get per-element count and max
+
+  idx_ms_combs_max = 0;
+  total_num_functions_max = 0;
+
+  MemKK::realloc_kokkos(d_idx_ms_combs_count, "pace:idx_ms_combs_count", nelements);
+  MemKK::realloc_kokkos(d_total_basis_size, "pace:total_basis_size", nelements);
+  auto h_idx_ms_combs_count = Kokkos::create_mirror_view(d_idx_ms_combs_count);
+  auto h_total_basis_size = Kokkos::create_mirror_view(d_total_basis_size);
+
+  for (int mu = 0; mu < nelements; mu++) {
+    int idx_ms_combs = 0;
+    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[mu];
+    const int total_basis_size = basis_set->total_basis_size[mu];
+
+    ACEBBasisFunction *basis = basis_set->basis[mu];
+
+    // rank=1
+    for (int func_rank1_ind = 0; func_rank1_ind < total_basis_size_rank1; ++func_rank1_ind)
+      idx_ms_combs++;
+
+    // rank > 1
+    for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
+      ACEBBasisFunction *func = &basis[func_ind];
+
+      // loop over {ms} combinations in sum
+      for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind)
+        idx_ms_combs++;
+    }
+    h_idx_ms_combs_count(mu) = idx_ms_combs;
+    idx_ms_combs_max = MAX(idx_ms_combs_max, idx_ms_combs);
+    total_num_functions_max = MAX(total_num_functions_max, total_basis_size_rank1 + total_basis_size);
+    h_total_basis_size(mu) = total_basis_size_rank1 + total_basis_size;
+  }
+
+  Kokkos::deep_copy(d_idx_ms_combs_count, h_idx_ms_combs_count);
+  Kokkos::deep_copy(d_total_basis_size, h_total_basis_size);
+
+  MemKK::realloc_kokkos(d_rank, "pace:rank", nelements, total_num_functions_max);
+  MemKK::realloc_kokkos(d_num_ms_combs, "pace:num_ms_combs", nelements, total_num_functions_max);
+  MemKK::realloc_kokkos(d_func_inds, "pace:func_inds", nelements, idx_ms_combs_max);
+  MemKK::realloc_kokkos(d_mus, "pace:mus", nelements, total_num_functions_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_ns, "pace:ns", nelements, total_num_functions_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_ls, "pace:ls", nelements, total_num_functions_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_ms_combs, "pace:ms_combs", nelements, idx_ms_combs_max, basis_set->rankmax);
+  MemKK::realloc_kokkos(d_gen_cgs, "pace:gen_cgs", nelements, idx_ms_combs_max);
+  MemKK::realloc_kokkos(d_coeffs, "pace:coeffs", nelements, total_num_functions_max, basis_set->ndensitymax);
+  // active set inverted
+  t_ace_3d d_ASI_temp;
+  MemKK::realloc_kokkos(d_ASI_temp, "pace:ASI_temp", nelements, total_num_functions_max, total_num_functions_max);
+
+  auto h_rank = Kokkos::create_mirror_view(d_rank);
+  auto h_num_ms_combs = Kokkos::create_mirror_view(d_num_ms_combs);
+  auto h_func_inds = Kokkos::create_mirror_view(d_func_inds);
+  auto h_mus = Kokkos::create_mirror_view(d_mus);
+  auto h_ns = Kokkos::create_mirror_view(d_ns);
+  auto h_ls = Kokkos::create_mirror_view(d_ls);
+  auto h_ms_combs = Kokkos::create_mirror_view(d_ms_combs);
+  auto h_gen_cgs = Kokkos::create_mirror_view(d_gen_cgs);
+  auto h_coeffs = Kokkos::create_mirror_view(d_coeffs);
+  // asi
+  auto h_ASI = Kokkos::create_mirror_view(d_ASI_temp);
+
+  // copy values on host
+
+  for (int mu = 0; mu < nelements; mu++) {
+    const int total_basis_size_rank1 = basis_set->total_basis_size_rank1[mu];
+    const int total_basis_size = basis_set->total_basis_size[mu];
+
+    ACEBBasisFunction *basis_rank1 = basis_set->basis_rank1[mu];
+    ACEBBasisFunction *basis = basis_set->basis[mu];
+
+    const int ndensity = basis_set->map_embedding_specifications.at(mu).ndensity;
+
+    int idx_ms_comb = 0;
+
+    // rank=1
+    for (int func_ind = 0; func_ind < total_basis_size_rank1; ++func_ind) {
+      ACEBBasisFunction *func = &basis_rank1[func_ind];
+      h_rank(mu, func_ind) = 1;
+      h_mus(mu, func_ind, 0) = func->mus[0];
+      h_ns(mu, func_ind, 0) = func->ns[0];
+
+      for (int p = 0; p < ndensity; ++p)
+            h_coeffs(mu, func_ind, p) = func->coeff[p];
+
+      h_gen_cgs(mu, idx_ms_comb) = func->gen_cgs[0];
+
+      h_func_inds(mu, idx_ms_comb) = func_ind;
+      idx_ms_comb++;
+    }
+
+    // rank > 1
+    for (int func_ind = 0; func_ind < total_basis_size; ++func_ind) {
+      ACEBBasisFunction *func = &basis[func_ind];
+      // TODO: check if func->ctildes are zero, then skip
+
+      const int func_ind_through = total_basis_size_rank1 + func_ind;
+
+      const int rank = h_rank(mu, func_ind_through) = func->rank;
+      h_num_ms_combs(mu, func_ind_through) = func->num_ms_combs;
+      for (int t = 0; t < rank; t++) {
+        h_mus(mu, func_ind_through, t) = func->mus[t];
+        h_ns(mu, func_ind_through, t) = func->ns[t];
+        h_ls(mu, func_ind_through, t) = func->ls[t];
+      }
+
+      for (int p = 0; p < ndensity; ++p)
+        h_coeffs(mu, func_ind_through, p) = func->coeff[p];
+
+
+      // loop over {ms} combinations in sum
+      for (int ms_ind = 0; ms_ind < func->num_ms_combs; ++ms_ind) {
+        auto ms = &func->ms_combs[ms_ind * rank]; // current ms-combination (of length = rank)
+        for (int t = 0; t < rank; t++)
+          h_ms_combs(mu, idx_ms_comb, t) = ms[t];
+
+
+        h_gen_cgs(mu, idx_ms_comb) = func->gen_cgs[ms_ind];
+
+
+        h_func_inds(mu, idx_ms_comb) = func_ind_through;
+        idx_ms_comb++;
+      }
+    }
+
+    // ASI
+    const auto &A_as_inv = b_evaluator->A_active_set_inv.at(mu);
+    for (int i = 0; i < total_basis_size_rank1 + total_basis_size; i++)
+      for (int j = 0; j < total_basis_size_rank1 + total_basis_size; j++){
+        h_ASI(mu,i,j) = A_as_inv(j,i); // transpose back for better performance on GPU
+    }
+  }
+
+  Kokkos::deep_copy(d_rank, h_rank);
+  Kokkos::deep_copy(d_num_ms_combs, h_num_ms_combs);
+  Kokkos::deep_copy(d_func_inds, h_func_inds);
+  Kokkos::deep_copy(d_mus, h_mus);
+  Kokkos::deep_copy(d_ns, h_ns);
+  Kokkos::deep_copy(d_ls, h_ls);
+  Kokkos::deep_copy(d_ms_combs, h_ms_combs);
+  Kokkos::deep_copy(d_gen_cgs, h_gen_cgs);
+  Kokkos::deep_copy(d_coeffs, h_coeffs);
+  Kokkos::deep_copy(d_ASI_temp, h_ASI);
+  d_ASI = d_ASI_temp; // copy from temopary array to const array
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::init_style()
+{
+  if (host_flag) {
+    if (lmp->kokkos->nthreads > 1)
+      error->all(FLERR,"Pair style pace/extrapolation/kk can currently only run on a single "
+                         "CPU thread");
+
+    PairPACEExtrapolation::init_style();
+    return;
+  }
+
+  if (atom->tag_enable == 0) error->all(FLERR, "Pair style PACE requires atom IDs");
+  if (force->newton_pair == 0) error->all(FLERR, "Pair style PACE requires newton pair on");
+
+  // neighbor list request for KOKKOS
+
+  neighflag = lmp->kokkos->neighflag;
+
+  auto request = neighbor->add_request(this, NeighConst::REQ_FULL);
+  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+                           !std::is_same<DeviceType,LMPDeviceType>::value);
+  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+  if (neighflag == FULL)
+    error->all(FLERR,"Must use half neighbor list style with pair pace/kk");
+
+  auto basis_set = aceimpl->basis_set;
+
+  nelements = basis_set->nelements;
+  lmax = basis_set->lmax;
+  nradmax = basis_set->nradmax;
+  nradbase = basis_set->nradbase;
+
+  // spherical harmonics
+
+  MemKK::realloc_kokkos(alm, "pace:alm", (lmax + 1) * (lmax + 1));
+  MemKK::realloc_kokkos(blm, "pace:blm", (lmax + 1) * (lmax + 1));
+  MemKK::realloc_kokkos(cl, "pace:cl", lmax + 1);
+  MemKK::realloc_kokkos(dl, "pace:dl", lmax + 1);
+
+  pre_compute_harmonics(lmax);
+  copy_pertype();
+  copy_splines();
+  copy_tilde();
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairPACEExtrapolationKokkos<DeviceType>::init_one(int i, int j)
+{
+  double cutone = PairPACEExtrapolation::init_one(i,j);
+
+  k_scale.h_view(i,j) = k_scale.h_view(j,i) = scale[i][j];
+  k_scale.template modify<LMPHostType>();
+
+  k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone;
+  k_cutsq.template modify<LMPHostType>();
+
+  return cutone;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  PairPACEExtrapolation::coeff(narg,arg);
+
+  auto b_evaluator = aceimpl->ace;
+  if (!b_evaluator->get_is_linear_extrapolation_grade()) {
+        error->all(FLERR,"Must use LINEAR ASI with pair pace/extrapolation/kk");
+  }
+  // Set up element lists
+
+  auto h_map = Kokkos::create_mirror_view(d_map);
+
+  for (int i = 1; i <= atom->ntypes; i++)
+    h_map(i) = map[i];
+
+  Kokkos::deep_copy(d_map,h_map);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::allocate()
+{
+  PairPACEExtrapolation::allocate();
+
+  int n = atom->ntypes + 1;
+  MemKK::realloc_kokkos(d_map, "pace:map", n);
+
+  MemKK::realloc_kokkos(k_cutsq, "pace:cutsq", n, n);
+  d_cutsq = k_cutsq.template view<DeviceType>();
+
+  MemKK::realloc_kokkos(k_scale, "pace:scale", n, n);
+  d_scale = k_scale.template view<DeviceType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct FindMaxNumNeighs {
+  typedef DeviceType device_type;
+  NeighListKokkos<DeviceType> k_list;
+
+  FindMaxNumNeighs(NeighListKokkos<DeviceType>* nl): k_list(*nl) {}
+  ~FindMaxNumNeighs() {k_list.copymode = 1;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& ii, int& maxneigh) const {
+    const int i = k_list.d_ilist[ii];
+    const int num_neighs = k_list.d_numneigh[i];
+    if (maxneigh < num_neighs) maxneigh = num_neighs;
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  if (host_flag) {
+    atomKK->sync(Host,X_MASK|TYPE_MASK);
+    PairPACEExtrapolation::compute(eflag_in,vflag_in);
+    atomKK->modified(Host,F_MASK);
+    return;
+  }
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+
+  ev_init(eflag,vflag,0);
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memoryKK->destroy_kokkos(k_eatom,eatom);
+    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memoryKK->destroy_kokkos(k_vatom,vatom);
+    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
+    d_vatom = k_vatom.view<DeviceType>();
+  }
+
+  if (gamma_flag && atom->nlocal > nmax) {
+        memory->destroy(extrapolation_grade_gamma);
+        nmax = atom->nlocal;
+        memory->create(extrapolation_grade_gamma, nmax, "pace/atom:gamma");
+        //zeroify array
+        memset(extrapolation_grade_gamma, 0, nmax * sizeof(*extrapolation_grade_gamma));
+  }
+
+  copymode = 1;
+  if (!force->newton_pair)
+    error->all(FLERR,"PairPACEExtrapolationKokkos requires 'newton on'");
+
+  atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  k_scale.template sync<DeviceType>();
+  k_cutsq.template sync<DeviceType>();
+
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+  inum = list->inum;
+
+  need_dup = lmp->kokkos->need_dup<DeviceType>();
+  if (need_dup) {
+    dup_f     = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f);
+    dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom);
+  } else {
+    ndup_f     = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f);
+    ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom);
+  }
+
+  maxneigh = 0;
+  Kokkos::parallel_reduce("pace::find_maxneigh", inum, FindMaxNumNeighs<DeviceType>(k_list), Kokkos::Max<int>(maxneigh));
+
+  int vector_length_default = 1;
+  int team_size_default = 1;
+  if (!host_flag)
+    team_size_default = 32;
+
+  chunk_size = MIN(chunksize,inum); // "chunksize" variable is set by user
+  chunk_offset = 0;
+
+
+  grow(chunk_size, maxneigh);
+
+  EV_FLOAT ev;
+
+  while (chunk_offset < inum) { // chunk up loop to prevent running out of memory
+
+    Kokkos::deep_copy(weights, 0.0);
+    Kokkos::deep_copy(weights_rank1, 0.0);
+    Kokkos::deep_copy(A, 0.0);
+    Kokkos::deep_copy(A_rank1, 0.0);
+    Kokkos::deep_copy(rhos, 0.0);
+
+    Kokkos::deep_copy(projections, 0.0);
+    Kokkos::deep_copy(d_gamma, 0.0);
+
+    EV_FLOAT ev_tmp;
+
+    if (chunk_size > inum - chunk_offset)
+      chunk_size = inum - chunk_offset;
+
+    //Neigh
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagPairPACEComputeNeigh>(chunk_size,team_size,vector_length);
+      int scratch_size = scratch_size_helper<int>(team_size * maxneigh);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeNeigh> policy_neigh(chunk_size,team_size,vector_length);
+      policy_neigh = policy_neigh.set_scratch_size(0, Kokkos::PerTeam(scratch_size));
+      Kokkos::parallel_for("ComputeNeigh",policy_neigh,*this);
+    }
+
+    //ComputeRadial
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagPairPACEComputeRadial>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeRadial> policy_radial(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      Kokkos::parallel_for("ComputeRadial",policy_radial,*this);
+    }
+
+    //ComputeYlm
+    {
+      int vector_length = vector_length_default;
+      int team_size = 16;
+      check_team_size_for<TagPairPACEComputeYlm>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm> policy_ylm(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      Kokkos::parallel_for("ComputeYlm",policy_ylm,*this);
+    }
+
+    //ComputeAi
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagPairPACEComputeAi>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi> policy_ai(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      Kokkos::parallel_for("ComputeAi",policy_ai,*this);
+    }
+
+    //ConjugateAi
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEConjugateAi> policy_conj_ai(0,chunk_size);
+      Kokkos::parallel_for("ConjugateAi",policy_conj_ai,*this);
+    }
+
+    //ComputeRho
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeRho> policy_rho(0, chunk_size * idx_ms_combs_max);
+      Kokkos::parallel_for("ComputeRho",policy_rho,*this);
+    }
+
+    //ComputeFS
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeFS> policy_fs(0,chunk_size);
+      Kokkos::parallel_for("ComputeFS",policy_fs,*this);
+    }
+
+    //ComputeGamma
+    if (gamma_flag) {
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeGamma> policy_gamma(0,chunk_size);
+      Kokkos::parallel_for("ComputeGamma",policy_gamma,*this);
+    }
+
+    //ComputeWeights
+    {
+      typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeWeights> policy_weights(0, chunk_size * idx_ms_combs_max);
+      Kokkos::parallel_for("ComputeWeights",policy_weights,*this);
+    }
+
+    //ComputeDerivative
+    {
+      int vector_length = vector_length_default;
+      int team_size = team_size_default;
+      check_team_size_for<TagPairPACEComputeDerivative>(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeDerivative> policy_derivative(((chunk_size+team_size-1)/team_size)*maxneigh,team_size,vector_length);
+      Kokkos::parallel_for("ComputeDerivative",policy_derivative,*this);
+    }
+
+    //ComputeForce
+    {
+      if (evflag) {
+        if (neighflag == HALF) {
+          typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALF,1> > policy_force(0,chunk_size);
+          Kokkos::parallel_reduce(policy_force, *this, ev_tmp);
+        } else if (neighflag == HALFTHREAD) {
+          typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALFTHREAD,1> > policy_force(0,chunk_size);
+          Kokkos::parallel_reduce("ComputeForce",policy_force, *this, ev_tmp);
+        }
+      } else {
+        if (neighflag == HALF) {
+          typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALF,0> > policy_force(0,chunk_size);
+          Kokkos::parallel_for(policy_force, *this);
+        } else if (neighflag == HALFTHREAD) {
+          typename Kokkos::RangePolicy<DeviceType,TagPairPACEComputeForce<HALFTHREAD,0> > policy_force(0,chunk_size);
+          Kokkos::parallel_for("ComputeForce",policy_force, *this);
+        }
+      }
+    }
+    ev += ev_tmp;
+
+    //if gamma_flag - copy current d_gamma to extrapolation_grade_gamma
+    if (gamma_flag){
+        h_gamma = Kokkos::create_mirror_view(d_gamma);
+        Kokkos:deep_copy(h_gamma, d_gamma);
+        memcpy(extrapolation_grade_gamma+chunk_offset, (void *) h_gamma.data(), sizeof(double)*chunk_size);
+    }
+
+    chunk_offset += chunk_size;
+  } // end while
+
+  if (need_dup)
+    Kokkos::Experimental::contribute(f, dup_f);
+
+  if (eflag_global) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    if (need_dup)
+      Kokkos::Experimental::contribute(d_vatom, dup_vatom);
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+
+  atomKK->modified(execution_space,F_MASK);
+
+  copymode = 0;
+
+  // free duplicated memory
+  if (need_dup) {
+    dup_f     = decltype(dup_f)();
+    dup_vatom = decltype(dup_vatom)();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeNeigh>::member_type& team) const
+{
+  const int ii = team.league_rank();
+  const int i = d_ilist[ii + chunk_offset];
+  const int itype = type[i];
+  const X_FLOAT xtmp = x(i,0);
+  const X_FLOAT ytmp = x(i,1);
+  const X_FLOAT ztmp = x(i,2);
+  const int jnum = d_numneigh[i];
+
+  // get a pointer to scratch memory
+  // This is used to cache whether or not an atom is within the cutoff
+  // If it is, inside is assigned to 1, otherwise -1
+  const int team_rank = team.team_rank();
+  const int scratch_shift = team_rank * maxneigh; // offset into pointer for entire team
+  int* inside = (int*)team.team_shmem().get_shmem(team.team_size() * maxneigh * sizeof(int), 0) + scratch_shift;
+
+  // loop over list of all neighbors within force cutoff
+  // distsq[] = distance sq to each
+  // rlist[] = distance vector to each
+  // nearest[] = atom indices of neighbors
+
+  int ncount = 0;
+  Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,jnum),
+      [&] (const int jj, int& count) {
+    int j = d_neighbors(i,jj);
+    j &= NEIGHMASK;
+
+    const int jtype = type(j);
+
+    const F_FLOAT delx = xtmp - x(j,0);
+    const F_FLOAT dely = ytmp - x(j,1);
+    const F_FLOAT delz = ztmp - x(j,2);
+    const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+    inside[jj] = -1;
+    if (rsq < d_cutsq(itype,jtype)) {
+     inside[jj] = 1;
+     count++;
+    }
+  },ncount);
+
+  d_ncount(ii) = ncount;
+
+  Kokkos::parallel_scan(Kokkos::TeamThreadRange(team,jnum),
+      [&] (const int jj, int& offset, bool final) {
+
+    if (inside[jj] < 0) return;
+
+    if (final) {
+      int j = d_neighbors(i,jj);
+      j &= NEIGHMASK;
+      const F_FLOAT delx = xtmp - x(j,0);
+      const F_FLOAT dely = ytmp - x(j,1);
+      const F_FLOAT delz = ztmp - x(j,2);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+      const F_FLOAT r = sqrt(rsq);
+      const F_FLOAT rinv = 1.0/r;
+      const int mu_j = d_map(type(j));
+      d_mu(ii,offset) = mu_j;
+      d_rnorms(ii,offset) = r;
+      d_rhats(ii,offset,0) = -delx*rinv;
+      d_rhats(ii,offset,1) = -dely*rinv;
+      d_rhats(ii,offset,2) = -delz*rinv;
+      d_nearest(ii,offset) = j;
+    }
+    offset++;
+  });
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeRadial, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeRadial>::member_type& team) const
+{
+  // Extract the atom number
+  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
+           ((chunk_size+team.team_size()-1)/team.team_size()));
+  if (ii >= chunk_size) return;
+  const int i = d_ilist[ii + chunk_offset];
+
+  // Extract the neighbor number
+  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
+  const int ncount = d_ncount(ii);
+  if (jj >= ncount) return;
+
+  const double r_norm = d_rnorms(ii, jj);
+  const int mu_i = d_map(type(i));
+  const int mu_j = d_mu(ii, jj);
+
+  evaluate_splines(ii, jj, r_norm, nradbase, nradmax, mu_i, mu_j);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeYlm, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm>::member_type& team) const
+{
+  // Extract the atom number
+  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
+           ((chunk_size+team.team_size()-1)/team.team_size()));
+  if (ii >= chunk_size) return;
+
+  // Extract the neighbor number
+  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
+  const int ncount = d_ncount(ii);
+  if (jj >= ncount) return;
+
+  const double xn = d_rhats(ii, jj, 0);
+  const double yn = d_rhats(ii, jj, 1);
+  const double zn = d_rhats(ii, jj, 2);
+  compute_ylm(ii,jj,xn,yn,zn,lmax);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeAi, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi>::member_type& team) const
+{
+  // Extract the atom number
+  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
+           ((chunk_size+team.team_size()-1)/team.team_size()));
+  if (ii >= chunk_size) return;
+
+  // Extract the neighbor number
+  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
+  const int ncount = d_ncount(ii);
+  if (jj >= ncount) return;
+
+  const int mu_j = d_mu(ii, jj);
+
+  // rank = 1
+  for (int n = 0; n < nradbase; n++)
+    Kokkos::atomic_add(&A_rank1(ii, mu_j, n), gr(ii, jj, n) * Y00);
+
+  // rank > 1
+  for (int n = 0; n < nradmax; n++) {
+    for (int l = 0; l <= lmax; l++) {
+      for (int m = 0; m <= l; m++) {
+        const int idx = l * (l + 1) + m; // (l, m)
+        Kokkos::atomic_add(&A(ii, mu_j, n, idx).re, fr(ii, jj, n, l) * ylm(ii, jj, idx).re);
+        Kokkos::atomic_add(&A(ii, mu_j, n, idx).im, fr(ii, jj, n, l) * ylm(ii, jj, idx).im);
+      }
+    }
+  }
+
+  // hard-core repulsion
+  Kokkos::atomic_add(&rho_core(ii), cr(ii, jj));
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEConjugateAi, const int& ii) const
+{
+  //complex conjugate A's (for NEGATIVE (-m) terms)
+  // for rank > 1
+  for (int mu_j = 0; mu_j < nelements; mu_j++) {
+    for (int n = 0; n < nradmax; n++) {
+      for (int l = 0; l <= lmax; l++) {
+        //fill in -m part in the outer loop using the same m <-> -m symmetry as for Ylm
+        for (int m = 1; m <= l; m++) {
+          const int idx = l * (l + 1) + m; // (l, m)
+          const int idxm = l * (l + 1) - m; // (l, -m)
+          const int factor = m % 2 == 0 ? 1 : -1;
+          A(ii, mu_j, n, idxm) = A(ii, mu_j, n, idx).conj() * (double)factor;
+        }
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeRho, const int& iter) const
+{
+  const int idx_ms_comb = iter / chunk_size;
+  const int ii = iter % chunk_size;
+
+  const int i = d_ilist[ii + chunk_offset];
+  const int mu_i = d_map(type(i));
+
+  if (idx_ms_comb >= d_idx_ms_combs_count(mu_i)) return;
+
+  const int ndensity = d_ndensity(mu_i);
+
+  const int func_ind = d_func_inds(mu_i, idx_ms_comb);
+  const int rank = d_rank(mu_i, func_ind);
+  const int r = rank - 1;
+
+  // Basis functions B with iterative product and density rho(p) calculation
+  if (rank == 1) {
+    const int mu = d_mus(mu_i, func_ind, 0);
+    const int n = d_ns(mu_i, func_ind, 0);
+    double A_cur = A_rank1(ii, mu, n - 1);
+    for (int p = 0; p < ndensity; ++p) {
+      //for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
+      Kokkos::atomic_add(&rhos(ii, p), d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb) * A_cur);
+    }
+
+
+    //gamma_i
+    if (gamma_flag)
+        Kokkos::atomic_add(&projections(ii, func_ind),  d_gen_cgs(mu_i, idx_ms_comb) * A_cur);
+
+  } else { // rank > 1
+    // loop over {ms} combinations in sum
+
+    // loop over m, collect B  = product of A with given ms
+    A_forward_prod(ii, idx_ms_comb, 0) = complex::one();
+
+    // fill forward A-product triangle
+    for (int t = 0; t < rank; t++) {
+      //TODO: optimize ns[t]-1 -> ns[t] during functions construction
+      const int mu = d_mus(mu_i, func_ind, t);
+      const int n = d_ns(mu_i, func_ind, t);
+      const int l = d_ls(mu_i, func_ind, t);
+      const int m = d_ms_combs(mu_i, idx_ms_comb, t); // current ms-combination (of length = rank)
+      const int idx = l * (l + 1) + m; // (l, m)
+      A_list(ii, idx_ms_comb, t) = A(ii, mu, n - 1, idx);
+      A_forward_prod(ii, idx_ms_comb, t + 1) = A_forward_prod(ii, idx_ms_comb, t) * A_list(ii, idx_ms_comb, t);
+    }
+
+    complex A_backward_prod = complex::one();
+
+    // fill backward A-product triangle
+    for (int t = r; t >= 1; t--) {
+      const complex dB = A_forward_prod(ii, idx_ms_comb, t) * A_backward_prod; // dB - product of all A's except t-th
+      dB_flatten(ii, idx_ms_comb, t) = dB;
+
+      A_backward_prod = A_backward_prod * A_list(ii, idx_ms_comb, t);
+    }
+    dB_flatten(ii, idx_ms_comb, 0) = A_forward_prod(ii, idx_ms_comb, 0) * A_backward_prod;
+
+    const complex B = A_forward_prod(ii, idx_ms_comb, rank);
+
+    for (int p = 0; p < ndensity; ++p) {
+      // real-part only multiplication
+      Kokkos::atomic_add(&rhos(ii, p), B.real_part_product(d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb)));
+    }
+    //gamma_i
+    if (gamma_flag)
+      Kokkos::atomic_add(&projections(ii, func_ind),  B.real_part_product(d_gen_cgs(mu_i, idx_ms_comb)));
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeFS, const int& ii) const
+{
+  const int i = d_ilist[ii + chunk_offset];
+  const int mu_i = d_map(type(i));
+
+  const double rho_cut = d_rho_core_cutoff(mu_i);
+  const double drho_cut = d_drho_core_cutoff(mu_i);
+  const int ndensity = d_ndensity(mu_i);
+
+  double evdwl, fcut, dfcut;
+  evdwl = fcut = dfcut = 0.0;
+
+  inner_cutoff(rho_core(ii), rho_cut, drho_cut, fcut, dfcut);
+  FS_values_and_derivatives(ii, evdwl, mu_i);
+
+  dF_drho_core(ii) = evdwl * dfcut + 1;
+  for (int p = 0; p < ndensity; ++p)
+    dF_drho(ii, p) *= fcut;
+
+
+  // tally energy contribution
+  if (eflag) {
+    double evdwl_cut = evdwl * fcut + rho_core(ii);
+    // E0 shift
+    evdwl_cut += d_E0vals(mu_i);
+    e_atom(ii) = evdwl_cut;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeGamma, const int& ii) const
+{
+  const int i = d_ilist[ii + chunk_offset];
+  const int mu_i = d_map(type(i));
+  const int basis_size = d_total_basis_size(mu_i);
+
+  double gamma_max = 0;
+  double current_gamma;
+  for (int j = 0; j <basis_size; j++) {
+    current_gamma = 0;
+
+    // compute row-matrix-multiplication asi_vector * A_as_inv (transposed matrix)
+    for (int k = 0; k < basis_size; k++)
+      current_gamma += projections(ii,k) * d_ASI(mu_i, k, j); //correct d_ASI(mu_i, j, k), but it is transposed during initialization
+
+    current_gamma = fabs(current_gamma);
+    if (current_gamma > gamma_max)
+      gamma_max = current_gamma;
+  }
+
+  // tally energy contribution
+  d_gamma(ii) = gamma_max;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeWeights, const int& iter) const
+{
+  const int idx_ms_comb = iter / chunk_size;
+  const int ii = iter % chunk_size;
+
+  const int i = d_ilist[ii + chunk_offset];
+  const int mu_i = d_map(type(i));
+
+  if (idx_ms_comb >= d_idx_ms_combs_count(mu_i)) return;
+
+  const int ndensity = d_ndensity(mu_i);
+
+  const int func_ind = d_func_inds(mu_i, idx_ms_comb);
+  const int rank = d_rank(mu_i, func_ind);
+
+  // Weights and theta calculation
+
+  if (rank == 1) {
+    const int mu = d_mus(mu_i, func_ind, 0);
+    const int n = d_ns(mu_i, func_ind, 0);
+    double theta = 0.0;
+    for (int p = 0; p < ndensity; ++p) {
+      // for rank=1 (r=0) only 1 ms-combination exists (ms_ind=0), so index of func.ctildes is 0..ndensity-1
+      theta += dF_drho(ii, p) * d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb);
+    }
+    Kokkos::atomic_add(&weights_rank1(ii, mu, n - 1), theta);
+  } else { // rank > 1
+    double theta = 0.0;
+    for (int p = 0; p < ndensity; ++p)
+      theta += dF_drho(ii, p) * d_coeffs(mu_i, func_ind, p) * d_gen_cgs(mu_i, idx_ms_comb);
+
+    theta *= 0.5; // 0.5 factor due to possible double counting ???
+    for (int t = 0; t < rank; ++t) {
+      const int m_t = d_ms_combs(mu_i, idx_ms_comb, t);
+      const int factor = (m_t % 2 == 0 ? 1 : -1);
+      const complex dB = dB_flatten(ii, idx_ms_comb, t);
+      const int mu_t = d_mus(mu_i, func_ind, t);
+      const int n_t = d_ns(mu_i, func_ind, t);
+      const int l_t = d_ls(mu_i, func_ind, t);
+      const int idx = l_t * (l_t + 1) + m_t; // (l, m)
+      const complex value = theta * dB;
+      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).re), value.re);
+      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idx).im), value.im);
+      // update -m_t (that could also be positive), because the basis is half_basis
+      const int idxm = l_t * (l_t + 1) - m_t; // (l, -m)
+      const complex valuem = theta * dB.conj() * (double)factor;
+      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).re), valuem.re);
+      Kokkos::atomic_add(&(weights(ii, mu_t, n_t - 1, idxm).im), valuem.im);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeDerivative, const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeDerivative>::member_type& team) const
+{
+  // Extract the atom number
+  int ii = team.team_rank() + team.team_size() * (team.league_rank() %
+           ((chunk_size+team.team_size()-1)/team.team_size()));
+  if (ii >= chunk_size) return;
+  const int i = d_ilist[ii + chunk_offset];
+
+  // Extract the neighbor number
+  const int jj = team.league_rank() / ((chunk_size+team.team_size()-1)/team.team_size());
+  const int ncount = d_ncount(ii);
+  if (jj >= ncount) return;
+
+  const int itype = type(i);
+  const double scale = d_scale(itype,itype);
+
+  const int mu_j = d_mu(ii, jj);
+  double r_hat[3];
+  r_hat[0] = d_rhats(ii, jj, 0);
+  r_hat[1] = d_rhats(ii, jj, 1);
+  r_hat[2] = d_rhats(ii, jj, 2);
+  const double r = d_rnorms(ii, jj);
+  const double rinv = 1.0/r;
+
+  double f_ji[3];
+  f_ji[0] = f_ji[1] = f_ji[2] = 0;
+
+  // for rank = 1
+  for (int n = 0; n < nradbase; ++n) {
+    if (weights_rank1(ii, mu_j, n) == 0) continue;
+    double &DG = dgr(ii, jj, n);
+    double DGR = DG * Y00;
+    DGR *= weights_rank1(ii, mu_j, n);
+    f_ji[0] += DGR * r_hat[0];
+    f_ji[1] += DGR * r_hat[1];
+    f_ji[2] += DGR * r_hat[2];
+  }
+
+  // for rank > 1
+  for (int n = 0; n < nradmax; n++) {
+    for (int l = 0; l <= lmax; l++) {
+      const double R_over_r = fr(ii, jj, n, l) * rinv;
+      const double DR = dfr(ii, jj, n, l);
+
+      // for m >= 0
+      for (int m = 0; m <= l; m++) {
+        const int idx = l * (l + 1) + m; // (l, m)
+        complex w = weights(ii, mu_j, n, idx);
+        if (w.re == 0.0 && w.im == 0.0) continue;
+        // counting for -m cases if m > 0
+        if (m > 0) {
+          w.re *= 2.0;
+          w.im *= 2.0;
+        }
+
+        complex DY[3];
+        DY[0] = dylm(ii, jj, idx, 0);
+        DY[1] = dylm(ii, jj, idx, 1);
+        DY[2] = dylm(ii, jj, idx, 2);
+        const complex Y_DR = ylm(ii, jj, idx) * DR;
+
+        complex grad_phi_nlm[3];
+        grad_phi_nlm[0] = Y_DR * r_hat[0] + DY[0] * R_over_r;
+        grad_phi_nlm[1] = Y_DR * r_hat[1] + DY[1] * R_over_r;
+        grad_phi_nlm[2] = Y_DR * r_hat[2] + DY[2] * R_over_r;
+        // real-part multiplication only
+        f_ji[0] += w.real_part_product(grad_phi_nlm[0]);
+        f_ji[1] += w.real_part_product(grad_phi_nlm[1]);
+        f_ji[2] += w.real_part_product(grad_phi_nlm[2]);
+      }
+    }
+  }
+
+  // hard-core repulsion
+  const double fpair = dF_drho_core(ii) * dcr(ii,jj);
+  f_ij(ii, jj, 0) = scale * f_ji[0] + fpair * r_hat[0];
+  f_ij(ii, jj, 1) = scale * f_ji[1] + fpair * r_hat[1];
+  f_ij(ii, jj, 2) = scale * f_ji[2] + fpair * r_hat[2];
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int NEIGHFLAG, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>, const int& ii, EV_FLOAT& ev) const
+{
+  // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
+  const auto v_f = ScatterViewHelper<typename NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
+  const auto a_f = v_f.template access<typename AtomicDup<NEIGHFLAG,DeviceType>::value>();
+
+  const int i = d_ilist[ii + chunk_offset];
+  const int itype = type(i);
+  const double scale = d_scale(itype,itype);
+
+  const int ncount = d_ncount(ii);
+
+  F_FLOAT fitmp[3] = {0.0,0.0,0.0};
+  for (int jj = 0; jj < ncount; jj++) {
+    int j = d_nearest(ii,jj);
+
+    double r_hat[3];
+    r_hat[0] = d_rhats(ii, jj, 0);
+    r_hat[1] = d_rhats(ii, jj, 1);
+    r_hat[2] = d_rhats(ii, jj, 2);
+    const double r = d_rnorms(ii, jj);
+    const double delx = -r_hat[0]*r;
+    const double dely = -r_hat[1]*r;
+    const double delz = -r_hat[2]*r;
+
+    const double fpairx = f_ij(ii, jj, 0);
+    const double fpairy = f_ij(ii, jj, 1);
+    const double fpairz = f_ij(ii, jj, 2);
+
+    fitmp[0] += fpairx;
+    fitmp[1] += fpairy;
+    fitmp[2] += fpairz;
+    a_f(j,0) -= fpairx;
+    a_f(j,1) -= fpairy;
+    a_f(j,2) -= fpairz;
+
+    // tally per-atom virial contribution
+    if (EVFLAG && vflag_either)
+      v_tally_xyz<NEIGHFLAG>(ev, i, j, fpairx, fpairy, fpairz, delx, dely, delz);
+  }
+
+  a_f(i,0) += fitmp[0];
+  a_f(i,1) += fitmp[1];
+  a_f(i,2) += fitmp[2];
+
+  // tally energy contribution
+  if (EVFLAG && eflag_either) {
+    const double evdwl = scale*e_atom(ii);
+    //ev_tally_full(i, 2.0 * evdwl, 0.0, 0.0, 0.0, 0.0, 0.0);
+    if (eflag_global) ev.evdwl += evdwl;
+    if (eflag_atom) d_eatom[i] += evdwl;
+  }
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const {
+  EV_FLOAT ev;
+  this->template operator()<NEIGHFLAG,EVFLAG>(TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>(), ii, ev);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int NEIGHFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz,
+      const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const
+{
+  // The vatom array is duplicated for OpenMP, atomic for CUDA, and neither for Serial
+
+  auto v_vatom = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom);
+  auto a_vatom = v_vatom.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+
+  const E_FLOAT v0 = delx*fx;
+  const E_FLOAT v1 = dely*fy;
+  const E_FLOAT v2 = delz*fz;
+  const E_FLOAT v3 = delx*fy;
+  const E_FLOAT v4 = delx*fz;
+  const E_FLOAT v5 = dely*fz;
+
+  if (vflag_global) {
+    ev.v[0] += v0;
+    ev.v[1] += v1;
+    ev.v[2] += v2;
+    ev.v[3] += v3;
+    ev.v[4] += v4;
+    ev.v[5] += v5;
+  }
+
+  if (vflag_atom) {
+    a_vatom(i,0) += 0.5*v0;
+    a_vatom(i,1) += 0.5*v1;
+    a_vatom(i,2) += 0.5*v2;
+    a_vatom(i,3) += 0.5*v3;
+    a_vatom(i,4) += 0.5*v4;
+    a_vatom(i,5) += 0.5*v5;
+    a_vatom(j,0) += 0.5*v0;
+    a_vatom(j,1) += 0.5*v1;
+    a_vatom(j,2) += 0.5*v2;
+    a_vatom(j,3) += 0.5*v3;
+    a_vatom(j,4) += 0.5*v4;
+    a_vatom(j,5) += 0.5*v5;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::pre_compute_harmonics(int lmax)
+{
+  auto h_alm = Kokkos::create_mirror_view(alm);
+  auto h_blm = Kokkos::create_mirror_view(blm);
+  auto h_cl = Kokkos::create_mirror_view(cl);
+  auto h_dl = Kokkos::create_mirror_view(dl);
+
+  for (int l = 1; l <= lmax; l++) {
+    const double lsq = l * l;
+    const double ld = 2 * l;
+    const double l1 = (4 * lsq - 1);
+    const double l2 = lsq - ld + 1;
+    for (int m = 0; m < l - 1; m++) {
+      const double msq = m * m;
+      const double a = sqrt((double(l1)) / (double(lsq - msq)));
+      const double b = -sqrt((double(l2 - msq)) / (double(4 * l2 - 1)));
+      const int idx = l * (l + 1) + m; // (l, m)
+      h_alm(idx) = a;
+      h_blm(idx) = b;
+    }
+  }
+
+  for (int l = 1; l <= lmax; l++) {
+    h_cl(l) = -sqrt(1.0 + 0.5 / (double(l)));
+    h_dl(l) = sqrt(double(2 * (l - 1) + 3));
+  }
+
+  Kokkos::deep_copy(alm, h_alm);
+  Kokkos::deep_copy(blm, h_blm);
+  Kokkos::deep_copy(cl, h_cl);
+  Kokkos::deep_copy(dl, h_dl);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::compute_barplm(int ii, int jj, double rz, int lmax) const
+{
+  // requires -1 <= rz <= 1 , NO CHECKING IS PERFORMED !!!!!!!!!
+  // prefactors include 1/sqrt(2) factor compared to reference
+
+  // l=0, m=0
+  // plm(ii, jj, 0, 0) = Y00/sq1o4pi; //= sq1o4pi;
+  plm(ii, jj, 0) = Y00; //= 1;
+  dplm(ii, jj, 0) = 0.0;
+
+  if (lmax > 0) {
+
+    // l=1, m=0
+    plm(ii, jj, 2) = Y00 * sq3 * rz;
+    dplm(ii, jj, 2) = Y00 * sq3;
+
+    // l=1, m=1
+    plm(ii, jj, 3) = -sq3o2 * Y00;
+    dplm(ii, jj, 3) = 0.0;
+
+    // loop l = 2, lmax
+    for (int l = 2; l <= lmax; l++) {
+      for (int m = 0; m < l - 1; m++) {
+        const int idx = l * (l + 1) + m; // (l, m)
+        const int idx1 = (l - 1) * l + m; // (l - 1, m)
+        const int idx2 = (l - 2) * (l - 1) + m; // (l - 2, m)
+        plm(ii, jj, idx) = alm(idx) * (rz * plm(ii, jj, idx1) + blm(idx) * plm(ii, jj, idx2));
+        dplm(ii, jj, idx) = alm(idx) * (plm(ii, jj, idx1) + rz * dplm(ii, jj, idx1) + blm(idx) * dplm(ii, jj, idx2));
+      }
+      const int idx = l * (l + 1) + l; // (l, l)
+      const int idx1 = l * (l + 1) + l - 1; // (l, l - 1)
+      const int idx2 = (l - 1) * l + l - 1; // (l - 1, l - 1)
+      const double t = dl(l) * plm(ii, jj, idx2);
+      plm(ii, jj, idx1) = t * rz;
+      dplm(ii, jj, idx1) = t;
+      plm(ii, jj, idx) = cl(l) * plm(ii, jj, idx2);
+      dplm(ii, jj, idx) = 0.0;
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::compute_ylm(int ii, int jj, double rx, double ry, double rz, int lmax) const
+{
+  // requires rx^2 + ry^2 + rz^2 = 1 , NO CHECKING IS PERFORMED !!!!!!!!!
+
+  complex phase;
+  complex phasem, mphasem1;
+  complex dyx, dyy, dyz;
+  complex rdy;
+
+  phase.re = rx;
+  phase.im = ry;
+
+  // compute barplm
+  compute_barplm(ii, jj, rz, lmax);
+
+  // m = 0
+  for (int l = 0; l <= lmax; l++) {
+    const int idx = l * (l + 1);
+
+    ylm(ii, jj, idx).re = plm(ii, jj, idx);
+    ylm(ii, jj, idx).im = 0.0;
+
+    dyz.re = dplm(ii, jj, idx);
+    rdy.re = dyz.re * rz;
+
+    dylm(ii, jj, idx, 0).re = -rdy.re * rx;
+    dylm(ii, jj, idx, 0).im = 0.0;
+    dylm(ii, jj, idx, 1).re = -rdy.re * ry;
+    dylm(ii, jj, idx, 1).im = 0.0;
+    dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
+    dylm(ii, jj, idx, 2).im = 0;
+  }
+  // m = 1
+  for (int l = 1; l <= lmax; l++) {
+    const int idx = l * (l + 1) + 1;
+
+    ylm(ii, jj, idx) = phase * plm(ii, jj, idx);
+
+    dyx.re = plm(ii, jj, idx);
+    dyx.im = 0.0;
+    dyy.re = 0.0;
+    dyy.im = plm(ii, jj, idx);
+    dyz.re = phase.re * dplm(ii, jj, idx);
+    dyz.im = phase.im * dplm(ii, jj, idx);
+
+    rdy.re = rx * dyx.re + +rz * dyz.re;
+    rdy.im = ry * dyy.im + rz * dyz.im;
+
+    dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx;
+    dylm(ii, jj, idx, 0).im = -rdy.im * rx;
+    dylm(ii, jj, idx, 1).re = -rdy.re * ry;
+    dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry;
+    dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
+    dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz;
+  }
+
+  // m > 1
+  phasem = phase;
+  for (int m = 2; m <= lmax; m++) {
+
+    mphasem1.re = phasem.re * double(m);
+    mphasem1.im = phasem.im * double(m);
+    phasem = phasem * phase;
+
+    for (int l = m; l <= lmax; l++) {
+      const int idx = l * (l + 1) + m;
+
+      ylm(ii, jj, idx).re = phasem.re * plm(ii, jj, idx);
+      ylm(ii, jj, idx).im = phasem.im * plm(ii, jj, idx);
+
+      dyx = mphasem1 * plm(ii, jj, idx);
+      dyy.re = -dyx.im;
+      dyy.im = dyx.re;
+      dyz = phasem * dplm(ii, jj, idx);
+
+      rdy.re = rx * dyx.re + ry * dyy.re + rz * dyz.re;
+      rdy.im = rx * dyx.im + ry * dyy.im + rz * dyz.im;
+
+      dylm(ii, jj, idx, 0).re = dyx.re - rdy.re * rx;
+      dylm(ii, jj, idx, 0).im = dyx.im - rdy.im * rx;
+      dylm(ii, jj, idx, 1).re = dyy.re - rdy.re * ry;
+      dylm(ii, jj, idx, 1).im = dyy.im - rdy.im * ry;
+      dylm(ii, jj, idx, 2).re = dyz.re - rdy.re * rz;
+      dylm(ii, jj, idx, 2).im = dyz.im - rdy.im * rz;
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::cutoff_func_poly(const double r, const double r_in, const double delta_in, double &fc, double &dfc) const
+{
+  if (r <= r_in-delta_in) {
+    fc = 1;
+    dfc = 0;
+  } else if (r >= r_in ) {
+    fc = 0;
+    dfc = 0;
+  } else {
+    double x = 1 - 2 * (1 + (r - r_in) / delta_in);
+    fc = 0.5 + 7.5 / 2. * (x / 4. - pow(x, 3) / 6. + pow(x, 5) / 20.);
+    dfc = -7.5 / delta_in * (0.25 - x * x / 2.0 + pow(x, 4) / 4.);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::Fexp(const double x, const double m, double &F, double &DF) const
+{
+  const double w = 1.e6;
+  const double eps = 1e-10;
+
+  const double lambda = pow(1.0 / w, m - 1.0);
+  if (abs(x) > eps) {
+    double g;
+    const double a = abs(x);
+    const double am = pow(a, m);
+    const double w3x3 = pow(w * a, 3); //// use cube
+    const double sign_factor = (signbit(x) ? -1 : 1);
+    if (w3x3 > 30.0)
+        g = 0.0;
+    else
+        g = exp(-w3x3);
+
+    const double omg = 1.0 - g;
+    F = sign_factor * (omg * am + lambda * g * a);
+    const double dg = -3.0 * w * w * w * a * a * g;
+    DF = m * pow(a, m - 1.0) * omg - am * dg + lambda * dg * a + lambda * g;
+  } else {
+    F = lambda * x;
+    DF = lambda;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::FexpShiftedScaled(const double rho, const double mexp, double &F, double &DF) const
+{
+  const double eps = 1e-10;
+
+  if (abs(mexp - 1.0) < eps) {
+    F = rho;
+    DF = 1;
+  } else {
+    const double a = abs(rho);
+    const double exprho = exp(-a);
+    const double nx = 1. / mexp;
+    const double xoff = pow(nx, (nx / (1.0 - nx))) * exprho;
+    const double yoff = pow(nx, (1 / (1.0 - nx))) * exprho;
+    const double sign_factor = (signbit(rho) ? -1 : 1);
+    F = sign_factor * (pow(xoff + a, mexp) - yoff);
+    DF = yoff + mexp * (-xoff + 1.0) * pow(xoff + a, mexp - 1.);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::inner_cutoff(const double rho_core, const double rho_cut, const double drho_cut,
+                                     double &fcut, double &dfcut) const
+{
+  double rho_low = rho_cut - drho_cut;
+  if (rho_core >= rho_cut) {
+    fcut = 0;
+    dfcut = 0;
+  } else if (rho_core <= rho_low) {
+    fcut = 1;
+    dfcut = 0;
+  } else {
+    cutoff_func_poly(rho_core, rho_cut, drho_cut, fcut, dfcut);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::FS_values_and_derivatives(const int ii, double &evdwl, const int mu_i) const
+{
+  double F, DF = 0;
+  int npoti = d_npoti(mu_i);
+  int ndensity = d_ndensity(mu_i);
+  for (int p = 0; p < ndensity; p++) {
+    const double wpre = d_wpre(mu_i, p);
+    const double mexp = d_mexp(mu_i, p);
+
+    if (npoti == FS)
+      Fexp(rhos(ii, p), mexp, F, DF);
+    else if (npoti == FS_SHIFTEDSCALED)
+      FexpShiftedScaled(rhos(ii, p), mexp, F, DF);
+
+    evdwl += F * wpre; // * weight (wpre)
+    dF_drho(ii, p) = DF * wpre; // * weight (wpre)
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::evaluate_splines(const int ii, const int jj, double r,
+                                                  int /*nradbase_c*/, int /*nradial_c*/,
+                                                  int mu_i, int mu_j) const
+{
+  auto &spline_gk = k_splines_gk.template view<DeviceType>()(mu_i, mu_j);
+  auto &spline_rnl = k_splines_rnl.template view<DeviceType>()(mu_i, mu_j);
+  auto &spline_hc = k_splines_hc.template view<DeviceType>()(mu_i, mu_j);
+
+  spline_gk.calcSplines(ii, jj, r, gr, dgr);
+
+  spline_rnl.calcSplines(ii, jj, r, d_values, d_derivatives);
+  for (int kk = 0; kk < (int)fr.extent(2); kk++) {
+    for (int ll = 0; ll < (int)fr.extent(3); ll++) {
+      const int flatten = kk*fr.extent(3) + ll;
+      fr(ii, jj, kk, ll) = d_values(ii, jj, flatten);
+      dfr(ii, jj, kk, ll) = d_derivatives(ii, jj, flatten);
+    }
+  }
+
+  spline_hc.calcSplines(ii, jj, r, d_values, d_derivatives);
+  cr(ii, jj) = d_values(ii, jj, 0);
+  dcr(ii, jj) = d_derivatives(ii, jj, 0);
+}
+
+/* ---------------------------------------------------------------------- */
+template<class DeviceType>
+void PairPACEExtrapolationKokkos<DeviceType>::SplineInterpolatorKokkos::operator=(const SplineInterpolator &spline) {
+    cutoff = spline.cutoff;
+    deltaSplineBins = spline.deltaSplineBins;
+    ntot = spline.ntot;
+    nlut = spline.nlut;
+    invrscalelookup = spline.invrscalelookup;
+    rscalelookup = spline.rscalelookup;
+    num_of_functions = spline.num_of_functions;
+
+    lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions);
+    auto h_lookupTable = Kokkos::create_mirror_view(lookupTable);
+    for (int i = 0; i < ntot+1; i++)
+        for (int j = 0; j < num_of_functions; j++)
+            for (int k = 0; k < 4; k++)
+                h_lookupTable(i, j, k) = spline.lookupTable(i, j, k);
+    Kokkos::deep_copy(lookupTable, h_lookupTable);
+}
+/* ---------------------------------------------------------------------- */
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairPACEExtrapolationKokkos<DeviceType>::SplineInterpolatorKokkos::calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const
+{
+  double wl, wl2, wl3, w2l1, w3l2;
+  double c[4];
+  double x = r * rscalelookup;
+  int nl = static_cast<int>(floor(x));
+
+  if (nl <= 0)
+    Kokkos::abort("Encountered very small distance. Stopping.");
+
+  if (nl < nlut) {
+    wl = x - double(nl);
+    wl2 = wl * wl;
+    wl3 = wl2 * wl;
+    w2l1 = 2.0 * wl;
+    w3l2 = 3.0 * wl2;
+    for (int func_id = 0; func_id < num_of_functions; func_id++) {
+      for (int idx = 0; idx < 4; idx++)
+        c[idx] = lookupTable(nl, func_id, idx);
+      d_values(ii, jj, func_id) = c[0] + c[1] * wl + c[2] * wl2 + c[3] * wl3;
+      d_derivatives(ii, jj, func_id) = (c[1] + c[2] * w2l1 + c[3] * w3l2) * rscalelookup;
+    }
+  } else { // fill with zeroes
+    for (int func_id = 0; func_id < num_of_functions; func_id++) {
+      d_values(ii, jj, func_id) = 0.0;
+      d_derivatives(ii, jj, func_id) = 0.0;
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<class TagStyle>
+void PairPACEExtrapolationKokkos<DeviceType>::check_team_size_for(int inum, int &team_size, int vector_length) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelForTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<class TagStyle>
+void PairPACEExtrapolationKokkos<DeviceType>::check_team_size_reduce(int inum, int &team_size, int vector_length) {
+  int team_size_max;
+
+  team_size_max = Kokkos::TeamPolicy<DeviceType,TagStyle>(inum,Kokkos::AUTO).team_size_max(*this,Kokkos::ParallelReduceTag());
+
+  if (team_size*vector_length > team_size_max)
+    team_size = team_size_max/vector_length;
+}
+
+template<class DeviceType>
+template<typename scratch_type>
+int PairPACEExtrapolationKokkos<DeviceType>::scratch_size_helper(int values_per_team) {
+  typedef Kokkos::View<scratch_type*, Kokkos::DefaultExecutionSpace::scratch_memory_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> > ScratchViewType;
+
+  return ScratchViewType::shmem_size(values_per_team);
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairPACEExtrapolationKokkos<DeviceType>::memory_usage()
+{
+  double bytes = 0;
+
+  bytes += MemKK::memory_usage(A);
+  bytes += MemKK::memory_usage(A_rank1);
+  bytes += MemKK::memory_usage(A_list);
+  bytes += MemKK::memory_usage(A_forward_prod);
+  bytes += MemKK::memory_usage(e_atom);
+  bytes += MemKK::memory_usage(rhos);
+  bytes += MemKK::memory_usage(dF_drho);
+  bytes += MemKK::memory_usage(weights);
+  bytes += MemKK::memory_usage(weights_rank1);
+  bytes += MemKK::memory_usage(rho_core);
+  bytes += MemKK::memory_usage(dF_drho_core);
+  bytes += MemKK::memory_usage(dB_flatten);
+  bytes += MemKK::memory_usage(fr);
+  bytes += MemKK::memory_usage(dfr);
+  bytes += MemKK::memory_usage(gr);
+  bytes += MemKK::memory_usage(dgr);
+  bytes += MemKK::memory_usage(d_values);
+  bytes += MemKK::memory_usage(d_derivatives);
+  bytes += MemKK::memory_usage(cr);
+  bytes += MemKK::memory_usage(dcr);
+  bytes += MemKK::memory_usage(plm);
+  bytes += MemKK::memory_usage(dplm);
+  bytes += MemKK::memory_usage(ylm);
+  bytes += MemKK::memory_usage(dylm);
+  bytes += MemKK::memory_usage(d_ncount);
+  bytes += MemKK::memory_usage(d_mu);
+  bytes += MemKK::memory_usage(d_rhats);
+  bytes += MemKK::memory_usage(d_rnorms);
+  bytes += MemKK::memory_usage(d_nearest);
+  bytes += MemKK::memory_usage(f_ij);
+  bytes += MemKK::memory_usage(d_rho_core_cutoff);
+  bytes += MemKK::memory_usage(d_drho_core_cutoff);
+  bytes += MemKK::memory_usage(d_E0vals);
+  bytes += MemKK::memory_usage(d_ndensity);
+  bytes += MemKK::memory_usage(d_npoti);
+  bytes += MemKK::memory_usage(d_wpre);
+  bytes += MemKK::memory_usage(d_mexp);
+  bytes += MemKK::memory_usage(d_idx_ms_combs_count);
+  bytes += MemKK::memory_usage(d_rank);
+  bytes += MemKK::memory_usage(d_num_ms_combs);
+  bytes += MemKK::memory_usage(d_func_inds);
+  bytes += MemKK::memory_usage(d_mus);
+  bytes += MemKK::memory_usage(d_ns);
+  bytes += MemKK::memory_usage(d_ls);
+  bytes += MemKK::memory_usage(d_ms_combs);
+  bytes += MemKK::memory_usage(d_gen_cgs);
+  bytes += MemKK::memory_usage(d_coeffs);
+  bytes += MemKK::memory_usage(alm);
+  bytes += MemKK::memory_usage(blm);
+  bytes += MemKK::memory_usage(cl);
+  bytes += MemKK::memory_usage(dl);
+  bytes += MemKK::memory_usage(d_total_basis_size);
+  bytes += MemKK::memory_usage(d_ASI);
+  bytes += MemKK::memory_usage(projections);
+  bytes += MemKK::memory_usage(d_gamma);
+
+  if (k_splines_gk.h_view.data()) {
+    for (int i = 0; i < nelements; i++) {
+      for (int j = 0; j < nelements; j++) {
+        bytes += k_splines_gk.h_view(i, j).memory_usage();
+        bytes += k_splines_rnl.h_view(i, j).memory_usage();
+        bytes += k_splines_hc.h_view(i, j).memory_usage();
+      }
+    }
+  }
+
+  return bytes;
+}
+
+/* ----------------------------------------------------------------------
+    extract method for extracting value of scale variable
+ ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void *PairPACEExtrapolationKokkos<DeviceType>::extract(const char *str, int &dim)
+{
+  //check if str=="gamma_flag" then compute extrapolation grades on this iteration
+  dim = 0;
+  if (strcmp(str, "gamma_flag") == 0) return (void *) &gamma_flag;
+
+  dim = 2;
+  if (strcmp(str, "scale") == 0) return (void *) scale;
+  return nullptr;
+}
+
+/* ----------------------------------------------------------------------
+   peratom requests from FixPair
+   return ptr to requested data
+   also return ncol = # of quantites per atom
+     0 = per-atom vector
+     1 or more = # of columns in per-atom array
+   return NULL if str is not recognized
+---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void *PairPACEExtrapolationKokkos<DeviceType>::extract_peratom(const char *str, int &ncol)
+{
+  if (strcmp(str, "gamma") == 0) {
+    ncol = 0;
+    return (void *) extrapolation_grade_gamma;
+  }
+
+  return nullptr;
+}
+
+/* ---------------------------------------------------------------------- */
+
+namespace LAMMPS_NS {
+template class PairPACEExtrapolationKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class PairPACEExtrapolationKokkos<LMPHostType>;
+#endif
+}
+
diff --git a/src/KOKKOS/pair_pace_extrapolation_kokkos.h b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
new file mode 100644
index 0000000000..a65841f8f6
--- /dev/null
+++ b/src/KOKKOS/pair_pace_extrapolation_kokkos.h
@@ -0,0 +1,339 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(pace/extrapolation/kk,PairPACEExtrapolationKokkos<LMPDeviceType>);
+PairStyle(pace/extrapolation/kk/device,PairPACEExtrapolationKokkos<LMPDeviceType>);
+PairStyle(pace/extrapolation/kk/host,PairPACEExtrapolationKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H
+#define LMP_PAIR_PACE_EXTRAPOLATION_KOKKOS_H
+
+#include "pair_pace_extrapolation.h"
+#include "kokkos_type.h"
+#include "pair_kokkos.h"
+
+class SplineInterpolator;
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class PairPACEExtrapolationKokkos : public PairPACEExtrapolation {
+ public:
+  struct TagPairPACEComputeNeigh{};
+  struct TagPairPACEComputeRadial{};
+  struct TagPairPACEComputeYlm{};
+  struct TagPairPACEComputeAi{};
+  struct TagPairPACEConjugateAi{};
+  struct TagPairPACEComputeRho{};
+  struct TagPairPACEComputeFS{};
+  struct TagPairPACEComputeGamma{};
+  struct TagPairPACEComputeWeights{};
+  struct TagPairPACEComputeDerivative{};
+
+  template<int NEIGHFLAG, int EVFLAG>
+  struct TagPairPACEComputeForce{};
+
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;
+  using complex = SNAComplex<double>;
+
+  PairPACEExtrapolationKokkos(class LAMMPS *);
+  ~PairPACEExtrapolationKokkos() override;
+
+  void compute(int, int) override;
+  void coeff(int, char **) override;
+  void init_style() override;
+  double init_one(int, int) override;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeNeigh,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeNeigh>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeRadial,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeRadial>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeYlm,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeYlm>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeAi,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeAi>::member_type& team) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEConjugateAi,const int& ii) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeRho,const int& iter) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeFS,const int& ii) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeGamma, const int& ii) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeWeights,const int& iter) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeDerivative,const typename Kokkos::TeamPolicy<DeviceType, TagPairPACEComputeDerivative>::member_type& team) const;
+
+  template<int NEIGHFLAG, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii) const;
+
+  template<int NEIGHFLAG, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagPairPACEComputeForce<NEIGHFLAG,EVFLAG>,const int& ii, EV_FLOAT&) const;
+
+
+  void *extract(const char *str, int &dim);
+  void *extract_peratom(const char *str, int &ncol);
+
+ protected:
+  int inum, maxneigh, chunk_size, chunk_offset, idx_ms_combs_max, total_num_functions_max;
+  int host_flag;
+  int gamma_flag;
+
+  int eflag, vflag;
+
+  int neighflag, max_ndensity;
+  int nelements, lmax, nradmax, nradbase;
+
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist;
+  typename AT::t_int_1d_randomread d_numneigh;
+
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d_randomread type;
+
+  typedef Kokkos::DualView<F_FLOAT**, DeviceType> tdual_fparams;
+  tdual_fparams k_cutsq, k_scale;
+  typedef Kokkos::View<F_FLOAT**, DeviceType> t_fparams;
+  t_fparams d_cutsq, d_scale;
+
+  typename AT::t_int_1d d_map;
+
+  int need_dup;
+
+  using KKDeviceType = typename KKDevice<DeviceType>::value;
+
+  template<typename DataType, typename Layout>
+  using DupScatterView = KKScatterView<DataType, Layout, KKDeviceType, KKScatterSum, KKScatterDuplicated>;
+
+  template<typename DataType, typename Layout>
+  using NonDupScatterView = KKScatterView<DataType, Layout, KKDeviceType, KKScatterSum, KKScatterNonDuplicated>;
+
+  DupScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout> dup_f;
+  DupScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout> dup_vatom;
+
+  NonDupScatterView<F_FLOAT*[3], typename DAT::t_f_array::array_layout> ndup_f;
+  NonDupScatterView<F_FLOAT*[6], typename DAT::t_virial_array::array_layout> ndup_vatom;
+
+  friend void pair_virial_fdotr_compute<PairPACEExtrapolationKokkos>(PairPACEExtrapolationKokkos*);
+
+  void grow(int, int);
+  void copy_pertype();
+  void copy_splines();
+  void copy_tilde();
+  void allocate() override;
+  void precompute_harmonics();
+  double memory_usage() override;
+
+  template<int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void v_tally_xyz(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &fx, const F_FLOAT &fy, const F_FLOAT &fz,
+      const F_FLOAT &delx, const F_FLOAT &dely, const F_FLOAT &delz) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_barplm(int, int, double, int) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_ylm(int, int, double, double, double, int) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void cutoff_func_poly(const double, const double, const double, double &, double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void Fexp(const double, const double, double &, double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void FexpShiftedScaled(const double, const double, double &, double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void inner_cutoff(const double, const double, const double, double &, double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void FS_values_and_derivatives(const int, double&, const int) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void evaluate_splines(const int, const int, double, int, int, int, int) const;
+
+  template<class TagStyle>
+  void check_team_size_for(int, int&, int);
+
+  template<class TagStyle>
+  void check_team_size_reduce(int, int&, int);
+
+  // Utility routine which wraps computing per-team scratch size requirements for
+  // ComputeNeigh, ComputeUi, and ComputeFusedDeidrj
+  template <typename scratch_type>
+  int scratch_size_helper(int values_per_team);
+
+  typedef Kokkos::View<int*, DeviceType> t_ace_1i;
+  typedef Kokkos::View<int**, DeviceType> t_ace_2i;
+  typedef Kokkos::View<int***, DeviceType> t_ace_3i;
+  typedef Kokkos::View<int****, DeviceType> t_ace_4i;
+  typedef Kokkos::View<double*, DeviceType> t_ace_1d;
+  typedef Kokkos::View<double**, DeviceType> t_ace_2d;
+  typedef Kokkos::View<double*[3], DeviceType> t_ace_2d3;
+  typedef Kokkos::View<double***, DeviceType> t_ace_3d;
+  typedef Kokkos::View<const double***, DeviceType> tc_ace_3d;
+  typedef Kokkos::View<double**[3], DeviceType> t_ace_3d3;
+  typedef Kokkos::View<double**[4], DeviceType> t_ace_3d4;
+  typedef Kokkos::View<double****, DeviceType> t_ace_4d;
+  typedef Kokkos::View<complex*, DeviceType> t_ace_1c;
+  typedef Kokkos::View<complex**, DeviceType> t_ace_2c;
+  typedef Kokkos::View<complex***, DeviceType> t_ace_3c;
+  typedef Kokkos::View<complex**[3], DeviceType> t_ace_3c3;
+  typedef Kokkos::View<complex****, DeviceType> t_ace_4c;
+  typedef Kokkos::View<complex***[3], DeviceType> t_ace_4c3;
+
+  typedef Kokkos::View<double*>::HostMirror th_ace_1d;
+
+  t_ace_3d A_rank1;
+  t_ace_4c A;
+
+  t_ace_3c A_list;
+  t_ace_3c A_forward_prod;
+
+  t_ace_3d weights_rank1;
+  t_ace_4c weights;
+
+  t_ace_1d e_atom;
+  t_ace_2d rhos;
+  t_ace_2d dF_drho;
+
+  // hard-core repulsion
+  t_ace_1d rho_core;
+  t_ace_3c dB_flatten;
+  t_ace_2d cr;
+  t_ace_2d dcr;
+  t_ace_1d dF_drho_core;
+
+  // radial functions
+  t_ace_4d fr;
+  t_ace_4d dfr;
+  t_ace_3d gr;
+  t_ace_3d dgr;
+  t_ace_3d d_values;
+  t_ace_3d d_derivatives;
+
+  // inverted active set
+  tc_ace_3d d_ASI;
+  t_ace_2d projections;
+  t_ace_1d d_gamma;
+  th_ace_1d h_gamma;
+
+  // Spherical Harmonics
+  void pre_compute_harmonics(int);
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_barplm(double rz, int lmaxi);
+
+  KOKKOS_INLINE_FUNCTION
+  void compute_ylm(double rx, double ry, double rz, int lmaxi);
+
+  t_ace_1d alm;
+  t_ace_1d blm;
+  t_ace_1d cl;
+  t_ace_1d dl;
+
+  t_ace_3d plm;
+  t_ace_3d dplm;
+
+  t_ace_3c ylm;
+  t_ace_4c3 dylm;
+
+  // short neigh list
+  t_ace_1i d_ncount;
+  t_ace_2d d_mu;
+  t_ace_2d d_rnorms;
+  t_ace_3d3 d_rhats;
+  t_ace_2i d_nearest;
+
+  // per-type
+  t_ace_1i d_ndensity;
+  t_ace_1i d_npoti;
+  t_ace_1d d_rho_core_cutoff;
+  t_ace_1d d_drho_core_cutoff;
+  t_ace_1d d_E0vals;
+  t_ace_2d d_wpre;
+  t_ace_2d d_mexp;
+
+  // tilde
+  t_ace_1i d_idx_ms_combs_count;
+  t_ace_1i d_total_basis_size;
+  t_ace_2i d_rank;
+  t_ace_2i d_num_ms_combs;
+  t_ace_2i d_func_inds;
+  t_ace_3i d_mus;
+  t_ace_3i d_ns;
+  t_ace_3i d_ls;
+  t_ace_3i d_ms_combs;
+//  t_ace_3d d_ctildes;
+  t_ace_2d d_gen_cgs;
+  t_ace_3d d_coeffs;
+
+  t_ace_3d3 f_ij;
+
+ public:
+  struct SplineInterpolatorKokkos {
+    int ntot, nlut, num_of_functions;
+    double cutoff, deltaSplineBins, invrscalelookup, rscalelookup;
+
+    t_ace_3d4 lookupTable;
+
+    void operator=(const SplineInterpolator &spline);
+
+    void deallocate() {
+      lookupTable = t_ace_3d4();
+    }
+
+    double memory_usage() {
+      return lookupTable.span() * sizeof(typename decltype(lookupTable)::value_type);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const;
+  };
+
+  Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType> k_splines_gk;
+  Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType> k_splines_rnl;
+  Kokkos::DualView<SplineInterpolatorKokkos**, DeviceType> k_splines_hc;
+
+};
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
diff --git a/src/KOKKOS/pair_pace_kokkos.cpp b/src/KOKKOS/pair_pace_kokkos.cpp
index dd5b97fe32..6f1e3feaf8 100644
--- a/src/KOKKOS/pair_pace_kokkos.cpp
+++ b/src/KOKKOS/pair_pace_kokkos.cpp
@@ -1574,7 +1574,25 @@ void PairPACEKokkos<DeviceType>::evaluate_splines(const int ii, const int jj, do
 }
 
 /* ---------------------------------------------------------------------- */
+template<class DeviceType>
+void PairPACEKokkos<DeviceType>::SplineInterpolatorKokkos::operator=(const SplineInterpolator &spline) {
+    cutoff = spline.cutoff;
+    deltaSplineBins = spline.deltaSplineBins;
+    ntot = spline.ntot;
+    nlut = spline.nlut;
+    invrscalelookup = spline.invrscalelookup;
+    rscalelookup = spline.rscalelookup;
+    num_of_functions = spline.num_of_functions;
 
+    lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions);
+    auto h_lookupTable = Kokkos::create_mirror_view(lookupTable);
+    for (int i = 0; i < ntot+1; i++)
+        for (int j = 0; j < num_of_functions; j++)
+            for (int k = 0; k < 4; k++)
+                h_lookupTable(i, j, k) = spline.lookupTable(i, j, k);
+    Kokkos::deep_copy(lookupTable, h_lookupTable);
+}
+/* ---------------------------------------------------------------------- */
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairPACEKokkos<DeviceType>::SplineInterpolatorKokkos::calcSplines(const int ii, const int jj, const double r, const t_ace_3d &d_values, const t_ace_3d &d_derivatives) const
diff --git a/src/KOKKOS/pair_pace_kokkos.h b/src/KOKKOS/pair_pace_kokkos.h
index cc2a3cb674..39cfd100f8 100644
--- a/src/KOKKOS/pair_pace_kokkos.h
+++ b/src/KOKKOS/pair_pace_kokkos.h
@@ -24,10 +24,11 @@ PairStyle(pace/kk/host,PairPACEKokkos<LMPHostType>);
 #define LMP_PAIR_PACE_KOKKOS_H
 
 #include "pair_pace.h"
-#include "ace-evaluator/ace_radial.h"
 #include "kokkos_type.h"
 #include "pair_kokkos.h"
 
+class SplineInterpolator;
+
 namespace LAMMPS_NS {
 
 template<class DeviceType>
@@ -293,23 +294,7 @@ class PairPACEKokkos : public PairPACE {
 
     t_ace_3d4 lookupTable;
 
-    void operator=(const SplineInterpolator &spline) {
-      cutoff = spline.cutoff;
-      deltaSplineBins = spline.deltaSplineBins;
-      ntot = spline.ntot;
-      nlut = spline.nlut;
-      invrscalelookup = spline.invrscalelookup;
-      rscalelookup = spline.rscalelookup;
-      num_of_functions = spline.num_of_functions;
-
-      lookupTable = t_ace_3d4("lookupTable", ntot+1, num_of_functions);
-      auto h_lookupTable = Kokkos::create_mirror_view(lookupTable);
-      for (int i = 0; i < ntot+1; i++)
-        for (int j = 0; j < num_of_functions; j++)
-          for (int k = 0; k < 4; k++)
-            h_lookupTable(i, j, k) = spline.lookupTable(i, j, k);
-      Kokkos::deep_copy(lookupTable, h_lookupTable);
-    }
+    void operator=(const SplineInterpolator &spline);
 
     void deallocate() {
       lookupTable = t_ace_3d4();
diff --git a/src/ML-PACE/pair_pace_extrapolation.cpp b/src/ML-PACE/pair_pace_extrapolation.cpp
index ec185e75df..8a0116526a 100644
--- a/src/ML-PACE/pair_pace_extrapolation.cpp
+++ b/src/ML-PACE/pair_pace_extrapolation.cpp
@@ -98,6 +98,8 @@ PairPACEExtrapolation::PairPACEExtrapolation(LAMMPS *lmp) : Pair(lmp)
   scale = nullptr;
   flag_compute_extrapolation_grade = 0;
   extrapolation_grade_gamma = nullptr;
+
+  chunksize = 4096;
 }
 
 /* ----------------------------------------------------------------------
@@ -133,16 +135,12 @@ void PairPACEExtrapolation::compute(int eflag, int vflag)
 
   double **x = atom->x;
   double **f = atom->f;
-  tagint *tag = atom->tag;
   int *type = atom->type;
   // number of atoms in cell
   int nlocal = atom->nlocal;
 
   int newton_pair = force->newton_pair;
 
-  // number of atoms including ghost atoms
-  int nall = nlocal + atom->nghost;
-
   // inum: length of the neighborlists list
   inum = list->inum;
 
@@ -283,7 +281,20 @@ void PairPACEExtrapolation::allocate()
 
 void PairPACEExtrapolation::settings(int narg, char **arg)
 {
-  if (narg > 0) error->all(FLERR, "Pair style pace/extrapolation supports no keywords");
+//  if (narg > 2) error->all(FLERR, "Pair style pace/extrapolation supports no keywords");
+  if (narg > 2) utils::missing_cmd_args(FLERR, "pair_style pace/extrapolation", error);
+  // ACE potentials are parameterized in metal units
+  if (strcmp("metal", update->unit_style) != 0)
+    error->all(FLERR, "ACE potentials require 'metal' units");
+
+  int iarg = 0;
+  while (iarg < narg) {
+      if (strcmp(arg[iarg], "chunksize") == 0) {
+          chunksize = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+          iarg += 2;
+      } else
+          error->all(FLERR, "Unknown pair_style pace keyword: {}", arg[iarg]);
+  }
 
   if (comm->me == 0)
     utils::logmesg(lmp, "ACE/AL version: {}.{}.{}\n", VERSION_YEAR, VERSION_MONTH, VERSION_DAY);
@@ -343,7 +354,6 @@ void PairPACEExtrapolation::coeff(int narg, char **arg)
   aceimpl->rec_ace->element_type_mapping.init(atom->ntypes + 1);
   aceimpl->rec_ace->element_type_mapping.fill(-1);    //-1 means atom not included into potential
 
-  FILE *species_type_file = nullptr;
 
   const int n = atom->ntypes;
   element_names.resize(n);
diff --git a/src/ML-PACE/pair_pace_extrapolation.h b/src/ML-PACE/pair_pace_extrapolation.h
index c5d9da23db..6f7eeb279e 100644
--- a/src/ML-PACE/pair_pace_extrapolation.h
+++ b/src/ML-PACE/pair_pace_extrapolation.h
@@ -49,13 +49,15 @@ class PairPACEExtrapolation : public Pair {
   struct ACEALImpl *aceimpl;
   int nmax;
 
-  void allocate();
+  virtual void allocate();
   std::vector<std::string> element_names;    // list of elements (used by dump pace/extrapolation)
   double *extrapolation_grade_gamma;         //per-atom gamma value
 
   int flag_compute_extrapolation_grade;
 
   double **scale;
+
+  int chunksize;
 };
 
 }    // namespace LAMMPS_NS
diff --git a/src/fix_pair.cpp b/src/fix_pair.cpp
index 5e5ccbf810..59f9dbda4a 100644
--- a/src/fix_pair.cpp
+++ b/src/fix_pair.cpp
@@ -21,6 +21,7 @@
 #include "memory.h"
 #include "pair.h"
 #include "update.h"
+#include "fmt/format.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
@@ -36,7 +37,7 @@ FixPair::FixPair(LAMMPS *lmp, int narg, char **arg) :
   if (nevery < 1) error->all(FLERR,"Illegal fix pair every value: {}", nevery);
 
   pairname = utils::strdup(arg[4]);
-  pstyle = force->pair_match(pairname,1,0);
+  query_pstyle(lmp);
   if (pstyle == nullptr) error->all(FLERR,"Pair style {} for fix pair not found", pairname);
 
   nfield = (narg-5) / 2;
@@ -130,6 +131,28 @@ FixPair::FixPair(LAMMPS *lmp, int narg, char **arg) :
   lasttime = -1;
 }
 
+/* ---------------------------------------------------------------------- */
+
+void FixPair::query_pstyle(LAMMPS *lmp) {
+    char *cptr=nullptr;
+    int nsub = 0;
+    if ((cptr = strchr(pairname, ':'))) {
+        *cptr = '\0';
+        nsub = utils::inumeric(FLERR,cptr+1,false,lmp);
+    }
+    pstyle = nullptr;
+    if (lmp->suffix_enable) {
+        if (lmp->suffix) {
+            pstyle = force->pair_match(fmt::format("{}/{}", pairname, lmp->suffix), 1, nsub);
+            if (pstyle == nullptr && (lmp->suffix2)) {
+                pstyle = force->pair_match(fmt::format("{}/{}", pairname, lmp->suffix2), 1, nsub);
+            }
+        }
+    }
+    if (pstyle == nullptr) pstyle = force->pair_match(pairname, 1, nsub);
+}
+
+
 /* ---------------------------------------------------------------------- */
 
 FixPair::~FixPair()
@@ -171,7 +194,7 @@ void FixPair::init()
 {
   // insure pair style still exists
 
-  pstyle = force->pair_match(pairname,1,0);
+  query_pstyle(lmp);
   if (pstyle == nullptr) error->all(FLERR,"Pair style {} for fix pair not found", pairname);
 }
 
diff --git a/src/fix_pair.h b/src/fix_pair.h
index 765b261ee5..57fbdf8ecb 100644
--- a/src/fix_pair.h
+++ b/src/fix_pair.h
@@ -56,6 +56,8 @@ class FixPair : public Fix {
   class Pair *pstyle;
   double *vector;
   double **array;
+
+  void query_pstyle(LAMMPS *lmp);
 };
 
 }    // namespace LAMMPS_NS