diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst
index f902b40b29..11334ea2d1 100644
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@@ -194,7 +194,7 @@ OPT.
    * :doc:`lubricateU/poly <pair_lubricateU>`
    * :doc:`mdpd <pair_mesodpd>`
    * :doc:`mdpd/rhosum <pair_mesodpd>`
-   * :doc:`meam <pair_meam>`
+   * :doc:`meam (k) <pair_meam>`
    * :doc:`meam/spline (o) <pair_meam_spline>`
    * :doc:`meam/sw/spline <pair_meam_sw_spline>`
    * :doc:`mesocnt <pair_mesocnt>`
diff --git a/doc/src/package.rst b/doc/src/package.rst
index 8f9a65b2cc..6ff34515cf 100644
--- a/doc/src/package.rst
+++ b/doc/src/package.rst
@@ -71,7 +71,7 @@ Syntax
            *no_affinity* values = none
        *kokkos* args = keyword value ...
          zero or more keyword/value pairs may be appended
-         keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* *comm/pair/forward* *comm/fix/forward* or *comm/reverse* or *gpu/aware* or *pair/only*
+         keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* *comm/pair/forward* *comm/fix/forward* or *comm/reverse* or *comm/pair/reverse* or *gpu/aware* or *pair/only*
            *neigh* value = *full* or *half*
              full = full neighbor list
              half = half neighbor list built in thread-safe manner
@@ -96,6 +96,7 @@ Syntax
            *comm/pair/forward* value = *no* or *device*
            *comm/fix/forward* value = *no* or *device*
            *comm/reverse* value = *no* or *host* or *device*
+           *comm/pair/reverse* value = *no* or *device*
              no = perform communication pack/unpack in non-KOKKOS mode
              host = perform pack/unpack on host (e.g. with OpenMP threading)
              device = perform pack/unpack on device (e.g. on GPU)
@@ -500,7 +501,7 @@ rule of thumb may give too large a binsize and the default should be
 overridden with a smaller value.
 
 The *comm* and *comm/exchange* and *comm/forward* and *comm/pair/forward*
-and *comm/fix/forward* and comm/reverse*
+and *comm/fix/forward* and *comm/reverse* and *comm/pair/reverse*
 keywords determine whether the host or device performs the packing and
 unpacking of data when communicating per-atom data between processors.
 "Exchange" communication happens only on timesteps that neighbor lists
@@ -521,9 +522,16 @@ packing/unpacking data for the communication. A value of *host* means to
 use the host, typically a multi-core CPU, and perform the
 packing/unpacking in parallel with threads. A value of *device* means to
 use the device, typically a GPU, to perform the packing/unpacking
-operation. If a value of *host* is used for the *comm/pair/forward* or
-*comm/fix/forward* keyword, it will be automatically be changed to *no*
-since these keywords don't support *host* mode.
+operation.
+
+For the *comm/pair/forward* or *comm/fix/forward* or *comm/pair/reverse*
+keywords, if a value of *host* is used it will be automatically
+be changed to *no* since these keywords don't support *host* mode. The
+value of *no* will also always be used when running on the CPU, i.e. setting
+the value to *device* will have no effect if the pair/fix style is 
+running on the CPU. For the *comm/fix/forward* or *comm/pair/reverse*
+keywords, not all styles support *device* mode and in that case will run
+in *no* mode instead.
 
 The optimal choice for these keywords depends on the input script and
 the hardware used. The *no* value is useful for verifying that the
diff --git a/doc/src/pair_meam.rst b/doc/src/pair_meam.rst
index afa89e10ca..eac8449b21 100644
--- a/doc/src/pair_meam.rst
+++ b/doc/src/pair_meam.rst
@@ -1,8 +1,11 @@
 .. index:: pair_style meam
+.. index:: pair_style meam/kk
 
 pair_style meam command
 =========================
 
+Accelerator Variants: *meam/kk*
+
 Syntax
 """"""
 
@@ -347,6 +350,12 @@ Most published MEAM parameter sets use the default values *attrac* = *repulse* =
 Setting *repuls* = *attrac* = *delta* corresponds to the form used in several
 recent published MEAM parameter sets, such as :ref:`(Valone) <Valone>`
 
+----------
+
+.. include:: accel_styles.rst
+
+----------
+
 .. note::
 
    The default form of the *erose* expression in LAMMPS was corrected
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 952f21e403..314205ea3e 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -182,6 +182,13 @@ action kokkos_base.h
 action kokkos_base_fft.h fft3d.h
 action kokkos_few.h
 action kokkos_type.h
+action meam_kokkos.h meam.h
+action meam_dens_final_kokkos.h meam_dens_final.cpp
+action meam_dens_init_kokkos.h meam_dens_init.cpp
+action meam_force_kokkos.h meam_force.cpp
+action meam_funcs_kokkos.h meam_funcs.cpp
+action meam_impl_kokkos.h meam_impl.cpp
+action meam_setup_done_kokkos.h meam_setup_done.cpp
 action memory_kokkos.h
 action modify_kokkos.cpp
 action modify_kokkos.h
@@ -207,13 +214,6 @@ action nbin_ssa_kokkos.cpp nbin_ssa.cpp
 action nbin_ssa_kokkos.h nbin_ssa.h
 action math_special_kokkos.cpp
 action math_special_kokkos.h
-action meam_setup_done_kokkos.h meam_setup_done.cpp
-action meam_kokkos.h meam.h
-action meam_impl_kokkos.h meam_impl.cpp
-action meam_funcs_kokkos.h meam_funcs.cpp
-action meam_force_kokkos.h meam_force.cpp
-action meam_dens_init_kokkos.h meam_dens_init.cpp
-action meam_dens_final_kokkos.h meam_dens_final.cpp
 action min_cg_kokkos.cpp
 action min_cg_kokkos.h
 action min_kokkos.cpp
@@ -294,8 +294,8 @@ action pair_lj_gromacs_kokkos.cpp pair_lj_gromacs.cpp
 action pair_lj_gromacs_kokkos.h pair_lj_gromacs.h
 action pair_lj_sdk_kokkos.cpp pair_lj_sdk.cpp
 action pair_lj_sdk_kokkos.h pair_lj_sdk.h
-action pair_meam_kokkos.h pair_meam.h
 action pair_meam_kokkos.cpp pair_meam.cpp
+action pair_meam_kokkos.h pair_meam.h
 action pair_morse_kokkos.cpp
 action pair_morse_kokkos.h
 action pair_multi_lucy_rx_kokkos.cpp pair_multi_lucy_rx.cpp
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 4883838273..fea5864225 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -109,6 +109,7 @@ void CommKokkos::init()
   exchange_comm_classic = lmp->kokkos->exchange_comm_classic;
   forward_comm_classic = lmp->kokkos->forward_comm_classic;
   forward_pair_comm_classic = lmp->kokkos->forward_pair_comm_classic;
+  reverse_pair_comm_classic = lmp->kokkos->reverse_pair_comm_classic;
   forward_fix_comm_classic = lmp->kokkos->forward_fix_comm_classic;
   reverse_comm_classic = lmp->kokkos->reverse_comm_classic;
   exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host;
@@ -478,12 +479,13 @@ void CommKokkos::forward_comm_device(Pair *pair)
   int nsize = pair->comm_forward;
   KokkosBase* pairKKBase = dynamic_cast<KokkosBase*>(pair);
 
+  int nmax = max_buf_pair;
   for (iswap = 0; iswap < nswap; iswap++) {
-    int n = MAX(max_buf_pair,nsize*sendnum[iswap]);
-    n = MAX(n,nsize*recvnum[iswap]);
-    if (n > max_buf_pair)
-      grow_buf_pair(n);
+    nmax = MAX(nmax,nsize*sendnum[iswap]);
+    nmax = MAX(nmax,nsize*recvnum[iswap]);
   }
+  if (nmax > max_buf_pair)
+    grow_buf_pair(nmax);
 
   for (iswap = 0; iswap < nswap; iswap++) {
 
@@ -545,8 +547,76 @@ void CommKokkos::grow_buf_fix(int n) {
 
 void CommKokkos::reverse_comm(Pair *pair)
 {
-  k_sendlist.sync<LMPHostType>();
-  CommBrick::reverse_comm(pair);
+  if (pair->execution_space == Host || !pair->reverse_comm_device || reverse_pair_comm_classic) {
+    k_sendlist.sync<LMPHostType>();
+    CommBrick::reverse_comm(pair);
+  } else {
+    k_sendlist.sync<LMPDeviceType>();
+    reverse_comm_device<LMPDeviceType>(pair);
+  }
+}
+
+template<class DeviceType>
+void CommKokkos::reverse_comm_device(Pair *pair)
+{
+  int iswap,n;
+  MPI_Request request;
+  DAT::tdual_xfloat_1d k_buf_tmp;
+
+  KokkosBase* pairKKBase = dynamic_cast<KokkosBase*>(pair);
+
+  int nsize = MAX(pair->comm_reverse,pair->comm_reverse_off);
+
+  int nmax = max_buf_pair;
+  for (iswap = 0; iswap < nswap; iswap++) {
+    nmax = MAX(nmax,nsize*sendnum[iswap]);
+    nmax = MAX(nmax,nsize*recvnum[iswap]);
+  }
+  if (nmax > max_buf_pair)
+    grow_buf_pair(nmax);
+
+  for (iswap = nswap-1; iswap >= 0; iswap--) {
+
+    // pack buffer
+
+    n = pairKKBase->pack_reverse_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send_pair);
+    DeviceType().fence();
+
+    // exchange with another proc
+    // if self, set recv buffer to send buffer
+
+    double* buf_send_pair;
+    double* buf_recv_pair;
+    if (lmp->kokkos->gpu_aware_flag) {
+      buf_send_pair = k_buf_send_pair.view<DeviceType>().data();
+      buf_recv_pair = k_buf_recv_pair.view<DeviceType>().data();
+    } else {
+      k_buf_send_pair.modify<DeviceType>();
+      k_buf_send_pair.sync<LMPHostType>();
+      buf_send_pair = k_buf_send_pair.h_view.data();
+      buf_recv_pair = k_buf_recv_pair.h_view.data();
+    }
+
+    if (sendproc[iswap] != me) {
+      if (sendnum[iswap])
+        MPI_Irecv(buf_recv_pair,nsize*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world,&request);
+      if (recvnum[iswap])
+        MPI_Send(buf_send_pair,n,MPI_DOUBLE,recvproc[iswap],0,world);
+      if (sendnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+
+      if (!lmp->kokkos->gpu_aware_flag) {
+        k_buf_recv_pair.modify<LMPHostType>();
+        k_buf_recv_pair.sync<DeviceType>();
+      }
+      k_buf_tmp = k_buf_recv_pair;
+    } else k_buf_tmp = k_buf_send_pair;
+
+    // unpack buffer
+
+    pairKKBase->unpack_reverse_comm_kokkos(sendnum[iswap],k_sendlist,
+                                       iswap,k_buf_tmp);
+    DeviceType().fence();
+  }
 }
 
 void CommKokkos::forward_comm(Dump *dump)
diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h
index 3fcce82e2c..80736c1f92 100644
--- a/src/KOKKOS/comm_kokkos.h
+++ b/src/KOKKOS/comm_kokkos.h
@@ -27,6 +27,7 @@ class CommKokkos : public CommBrick {
   bool exchange_comm_classic;
   bool forward_comm_classic;
   bool forward_pair_comm_classic;
+  bool reverse_pair_comm_classic;
   bool forward_fix_comm_classic;
   bool reverse_comm_classic;
   bool exchange_comm_on_host;
@@ -58,6 +59,7 @@ class CommKokkos : public CommBrick {
   template<class DeviceType> void forward_comm_device(int dummy);
   template<class DeviceType> void reverse_comm_device();
   template<class DeviceType> void forward_comm_device(Pair *pair);
+  template<class DeviceType> void reverse_comm_device(Pair *pair);
   template<class DeviceType> void forward_comm_device(Fix *fix, int size=0);
   template<class DeviceType> void exchange_device();
   template<class DeviceType> void borders_device();
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 18f05fa281..3ecce0d75d 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -91,6 +91,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   exchange_comm_changed = 0;
   forward_comm_changed = 0;
   forward_pair_comm_changed = 0;
+  reverse_pair_comm_changed = 0;
   forward_fix_comm_changed = 0;
   reverse_comm_changed = 0;
 
@@ -239,7 +240,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
     newtonflag = 0;
 
     exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
-    forward_pair_comm_classic = forward_fix_comm_classic = 0;
+    forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 0;
 
     exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
   } else {
@@ -253,7 +254,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
     newtonflag = 1;
 
     exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
-    forward_pair_comm_classic = forward_fix_comm_classic = 1;
+    forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 1;
 
     exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
   }
@@ -394,17 +395,17 @@ void KokkosLMP::accelerator(int narg, char **arg)
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"no") == 0) {
         exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
-        forward_pair_comm_classic = forward_fix_comm_classic = 1;
+        forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 1;
 
         exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
       } else if (strcmp(arg[iarg+1],"host") == 0) {
         exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
-        forward_pair_comm_classic = forward_fix_comm_classic = 1;
+        forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 1;
 
         exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1;
       } else if (strcmp(arg[iarg+1],"device") == 0) {
         exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
-        forward_pair_comm_classic = forward_fix_comm_classic = 0;
+        forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 0;
 
         exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
@@ -441,6 +442,14 @@ void KokkosLMP::accelerator(int narg, char **arg)
       else error->all(FLERR,"Illegal package kokkos command");
       forward_pair_comm_changed = 0;
       iarg += 2;
+    } else if (strcmp(arg[iarg],"comm/pair/reverse") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"no") == 0) reverse_pair_comm_classic = 1;
+      else if (strcmp(arg[iarg+1],"host") == 0) reverse_pair_comm_classic = 1;
+      else if (strcmp(arg[iarg+1],"device") == 0) reverse_pair_comm_classic = 0;
+      else error->all(FLERR,"Illegal package kokkos command");
+      reverse_pair_comm_changed = 0;
+      iarg += 2;
     } else if (strcmp(arg[iarg],"comm/fix/forward") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"no") == 0) forward_fix_comm_classic = 1;
@@ -515,6 +524,10 @@ void KokkosLMP::accelerator(int narg, char **arg)
       forward_pair_comm_classic = 1;
       forward_pair_comm_changed = 1;
     }
+    if (reverse_pair_comm_classic == 0) {
+      reverse_pair_comm_classic = 1;
+      reverse_pair_comm_changed = 1;
+    }
     if (forward_fix_comm_classic == 0) {
       forward_fix_comm_classic = 1;
       forward_fix_comm_changed = 1;
@@ -540,6 +553,10 @@ void KokkosLMP::accelerator(int narg, char **arg)
       forward_pair_comm_classic = 0;
       forward_pair_comm_changed = 0;
     }
+    if (reverse_pair_comm_changed) {
+      reverse_pair_comm_classic = 0;
+      reverse_pair_comm_changed = 0;
+    }
     if (forward_fix_comm_changed) {
       forward_fix_comm_classic = 0;
       forward_fix_comm_changed = 0;
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 07d172e524..1cecd69c5f 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -30,6 +30,7 @@ class KokkosLMP : protected Pointers {
   int exchange_comm_classic;
   int forward_comm_classic;
   int forward_pair_comm_classic;
+  int reverse_pair_comm_classic;
   int forward_fix_comm_classic;
   int reverse_comm_classic;
   int exchange_comm_on_host;
@@ -38,6 +39,7 @@ class KokkosLMP : protected Pointers {
   int exchange_comm_changed;
   int forward_comm_changed;
   int forward_pair_comm_changed;
+  int reverse_pair_comm_changed;
   int forward_fix_comm_changed;
   int reverse_comm_changed;
   int nthreads,ngpus;
diff --git a/src/KOKKOS/kokkos_base.h b/src/KOKKOS/kokkos_base.h
index f18d55eec2..c593f0ae60 100644
--- a/src/KOKKOS/kokkos_base.h
+++ b/src/KOKKOS/kokkos_base.h
@@ -29,6 +29,10 @@ class KokkosBase {
                                        int, int *) {return 0;};
   virtual void unpack_forward_comm_kokkos(int, int, DAT::tdual_xfloat_1d &) {}
 
+  virtual int pack_reverse_comm_kokkos(int, int, DAT::tdual_xfloat_1d &) {return 0;};
+  virtual void unpack_reverse_comm_kokkos(int, DAT::tdual_int_2d,
+                                          int, DAT::tdual_xfloat_1d &) {}
+
   // Fix
   virtual int pack_forward_comm_fix_kokkos(int, DAT::tdual_int_2d,
                                            int, DAT::tdual_xfloat_1d &,
diff --git a/src/KOKKOS/math_special_kokkos.cpp b/src/KOKKOS/math_special_kokkos.cpp
index 89f6586581..eca64fbb4e 100644
--- a/src/KOKKOS/math_special_kokkos.cpp
+++ b/src/KOKKOS/math_special_kokkos.cpp
@@ -1,11 +1,202 @@
 // clang-format off
-
 #include "math_special_kokkos.h"
+
 #include <cmath>
-#include <cstdint>
+#include <cstdint> // IWYU pragma: keep
+#include <limits>
 
 using namespace LAMMPS_NS;
 
+static constexpr int nmaxfactorial = 167;
+
+/* ----------------------------------------------------------------------
+   factorial n table, size nmaxfactorial+1
+------------------------------------------------------------------------- */
+
+static const double nfac_table[] = {
+  1,
+  1,
+  2,
+  6,
+  24,
+  120,
+  720,
+  5040,
+  40320,
+  362880,
+  3628800,
+  39916800,
+  479001600,
+  6227020800,
+  87178291200,
+  1307674368000,
+  20922789888000,
+  355687428096000,
+  6.402373705728e+15,
+  1.21645100408832e+17,
+  2.43290200817664e+18,
+  5.10909421717094e+19,
+  1.12400072777761e+21,
+  2.5852016738885e+22,
+  6.20448401733239e+23,
+  1.5511210043331e+25,
+  4.03291461126606e+26,
+  1.08888694504184e+28,
+  3.04888344611714e+29,
+  8.8417619937397e+30,
+  2.65252859812191e+32,
+  8.22283865417792e+33,
+  2.63130836933694e+35,
+  8.68331761881189e+36,
+  2.95232799039604e+38,
+  1.03331479663861e+40,
+  3.71993326789901e+41,
+  1.37637530912263e+43,
+  5.23022617466601e+44,
+  2.03978820811974e+46,
+  8.15915283247898e+47,
+  3.34525266131638e+49,
+  1.40500611775288e+51,
+  6.04152630633738e+52,
+  2.65827157478845e+54,
+  1.1962222086548e+56,
+  5.50262215981209e+57,
+  2.58623241511168e+59,
+  1.24139155925361e+61,
+  6.08281864034268e+62,
+  3.04140932017134e+64,
+  1.55111875328738e+66,
+  8.06581751709439e+67,
+  4.27488328406003e+69,
+  2.30843697339241e+71,
+  1.26964033536583e+73,
+  7.10998587804863e+74,
+  4.05269195048772e+76,
+  2.35056133128288e+78,
+  1.3868311854569e+80,
+  8.32098711274139e+81,
+  5.07580213877225e+83,
+  3.14699732603879e+85,
+  1.98260831540444e+87,
+  1.26886932185884e+89,
+  8.24765059208247e+90,
+  5.44344939077443e+92,
+  3.64711109181887e+94,
+  2.48003554243683e+96,
+  1.71122452428141e+98,
+  1.19785716699699e+100,
+  8.50478588567862e+101,
+  6.12344583768861e+103,
+  4.47011546151268e+105,
+  3.30788544151939e+107,
+  2.48091408113954e+109,
+  1.88549470166605e+111,
+  1.45183092028286e+113,
+  1.13242811782063e+115,
+  8.94618213078297e+116,
+  7.15694570462638e+118,
+  5.79712602074737e+120,
+  4.75364333701284e+122,
+  3.94552396972066e+124,
+  3.31424013456535e+126,
+  2.81710411438055e+128,
+  2.42270953836727e+130,
+  2.10775729837953e+132,
+  1.85482642257398e+134,
+  1.65079551609085e+136,
+  1.48571596448176e+138,
+  1.3520015276784e+140,
+  1.24384140546413e+142,
+  1.15677250708164e+144,
+  1.08736615665674e+146,
+  1.03299784882391e+148,
+  9.91677934870949e+149,
+  9.61927596824821e+151,
+  9.42689044888324e+153,
+  9.33262154439441e+155,
+  9.33262154439441e+157,
+  9.42594775983835e+159,
+  9.61446671503512e+161,
+  9.90290071648618e+163,
+  1.02990167451456e+166,
+  1.08139675824029e+168,
+  1.14628056373471e+170,
+  1.22652020319614e+172,
+  1.32464181945183e+174,
+  1.44385958320249e+176,
+  1.58824554152274e+178,
+  1.76295255109024e+180,
+  1.97450685722107e+182,
+  2.23119274865981e+184,
+  2.54355973347219e+186,
+  2.92509369349301e+188,
+  3.3931086844519e+190,
+  3.96993716080872e+192,
+  4.68452584975429e+194,
+  5.5745857612076e+196,
+  6.68950291344912e+198,
+  8.09429852527344e+200,
+  9.8750442008336e+202,
+  1.21463043670253e+205,
+  1.50614174151114e+207,
+  1.88267717688893e+209,
+  2.37217324288005e+211,
+  3.01266001845766e+213,
+  3.8562048236258e+215,
+  4.97450422247729e+217,
+  6.46685548922047e+219,
+  8.47158069087882e+221,
+  1.118248651196e+224,
+  1.48727070609069e+226,
+  1.99294274616152e+228,
+  2.69047270731805e+230,
+  3.65904288195255e+232,
+  5.01288874827499e+234,
+  6.91778647261949e+236,
+  9.61572319694109e+238,
+  1.34620124757175e+241,
+  1.89814375907617e+243,
+  2.69536413788816e+245,
+  3.85437071718007e+247,
+  5.5502938327393e+249,
+  8.04792605747199e+251,
+  1.17499720439091e+254,
+  1.72724589045464e+256,
+  2.55632391787286e+258,
+  3.80892263763057e+260,
+  5.71338395644585e+262,
+  8.62720977423323e+264,
+  1.31133588568345e+267,
+  2.00634390509568e+269,
+  3.08976961384735e+271,
+  4.78914290146339e+273,
+  7.47106292628289e+275,
+  1.17295687942641e+278,
+  1.85327186949373e+280,
+  2.94670227249504e+282,
+  4.71472363599206e+284,
+  7.59070505394721e+286,
+  1.22969421873945e+289,
+  2.0044015765453e+291,
+  3.28721858553429e+293,
+  5.42391066613159e+295,
+  9.00369170577843e+297,
+  1.503616514865e+300, // nmaxfactorial = 167
+};
+
+/* ----------------------------------------------------------------------
+   factorial n vial lookup from precomputed table
+------------------------------------------------------------------------- */
+
+double MathSpecialKokkos::factorial(const int n)
+{
+  if (n < 0 || n > nmaxfactorial)
+    return std::numeric_limits<double>::quiet_NaN();
+
+  return nfac_table[n];
+}
+
+
 /* Library libcerf:
  *   Compute complex error functions, based on a new implementation of
  *   Faddeeva's w_of_z. Also provide Dawson and Voigt functions.
@@ -477,59 +668,3 @@ double MathSpecialKokkos::erfcx_y100(const double y100)
     return 1.0;
 } /* erfcx_y100 */
 
-/* optimizer friendly implementation of exp2(x).
- *
- * strategy:
- *
- * split argument into an integer part and a fraction:
- * ipart = floor(x+0.5);
- * fpart = x - ipart;
- *
- * compute exp2(ipart) from setting the ieee754 exponent
- * compute exp2(fpart) using a pade' approximation for x in [-0.5;0.5[
- *
- * the result becomes: exp2(x) = exp2(ipart) * exp2(fpart)
- */
-
-/* IEEE 754 double precision floating point data manipulation */
-typedef union
-{
-    double   f;
-    uint64_t u;
-    struct {int32_t  i0,i1;} s;
-}  udi_t;
-
-static const double fm_exp2_q[] = {
-/*  1.00000000000000000000e0, */
-    2.33184211722314911771e2,
-    4.36821166879210612817e3
-};
-static const double fm_exp2_p[] = {
-    2.30933477057345225087e-2,
-    2.02020656693165307700e1,
-    1.51390680115615096133e3
-};
-
-double MathSpecialKokkos::exp2_x86(double x)
-{
-    double   ipart, fpart, px, qx;
-    udi_t    epart;
-
-    ipart = floor(x+0.5);
-    fpart = x - ipart;
-    epart.s.i0 = 0;
-    epart.s.i1 = (((int) ipart) + 1023) << 20;
-
-    x = fpart*fpart;
-
-    px =        fm_exp2_p[0];
-    px = px*x + fm_exp2_p[1];
-    qx =    x + fm_exp2_q[0];
-    px = px*x + fm_exp2_p[2];
-    qx = qx*x + fm_exp2_q[1];
-
-    px = px * fpart;
-
-    x = 1.0 + 2.0*(px/(qx-px));
-    return epart.f*x;
-}
diff --git a/src/KOKKOS/math_special_kokkos.h b/src/KOKKOS/math_special_kokkos.h
index f2d23a65eb..0264cb4134 100644
--- a/src/KOKKOS/math_special_kokkos.h
+++ b/src/KOKKOS/math_special_kokkos.h
@@ -1,4 +1,3 @@
-// clang-format off
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -15,98 +14,240 @@
 #ifndef LMP_MATH_SPECIAL_KOKKOS_H
 #define LMP_MATH_SPECIAL_KOKKOS_H
 
-#include <cmath>
 #include "kokkos_type.h"
+#include <cmath>
 
 namespace LAMMPS_NS {
 
 namespace MathSpecialKokkos {
 
+  /*! Fast tabulated factorial function
+   *
+   *  This function looks up pre-computed factorial values for arguments of n = 0
+   *  to a maximum of 167, which is the maximal value representable by a double
+   *  precision floating point number.  For other values of n a NaN value is returned.
+   *
+   *  \param   n  argument (valid: 0 <= n <= 167)
+   *  \return  value of n! as double precision number or NaN */
+
+  extern double factorial(const int n);
+
+  /* optimizer friendly implementation of exp2(x).
+   *
+   * strategy:
+   *
+   * split argument into an integer part and a fraction:
+   * ipart = floor(x+0.5);
+   * fpart = x - ipart;
+   *
+   * compute exp2(ipart) from setting the ieee754 exponent
+   * compute exp2(fpart) using a pade' approximation for x in [-0.5;0.5[
+   *
+   * the result becomes: exp2(x) = exp2(ipart) * exp2(fpart)
+   */
+
+  /* IEEE 754 double precision floating point data manipulation */
+  typedef union
+  {
+    double   f;
+    uint64_t u;
+    struct {int32_t  i0,i1;} s;
+  }  udi_t;
+
+  /* double precision constants */
+  #define FM_DOUBLE_LOG2OFE  1.4426950408889634074
+
+  /*! Fast implementation of 2^x without argument checks for little endian CPUs
+   *
+   *  This function implements an optimized version of pow(2.0, x) that does not
+   *  check for valid arguments and thus may only be used where arguments are well
+   *  behaved.  The implementation makes assumptions about the layout of double
+   *  precision floating point numbers in memory and thus will only work on little
+   *  endian CPUs.  If little endian cannot be safely detected, the result of
+   *  calling pow(2.0, x) will be returned.  This function also is the basis for
+   *  the fast exponential fm_exp(x).
+   *
+   *  \param   x argument
+   *  \return  value of 2^x as double precision number */
+
+  KOKKOS_INLINE_FUNCTION
+  static double exp2_x86(double x)
+  {
+  #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+      double   ipart, fpart, px, qx;
+      udi_t    epart;
+
+  const double fm_exp2_q[2] = {
+  /*  1.00000000000000000000e0, */
+      2.33184211722314911771e2,
+      4.36821166879210612817e3
+  };
+  const double fm_exp2_p[3] = {
+      2.30933477057345225087e-2,
+      2.02020656693165307700e1,
+      1.51390680115615096133e3
+  };
+
+      ipart = floor(x+0.5);
+      fpart = x - ipart;
+      epart.s.i0 = 0;
+      epart.s.i1 = (((int) ipart) + 1023) << 20;
+
+      x = fpart*fpart;
+
+      px =        fm_exp2_p[0];
+      px = px*x + fm_exp2_p[1];
+      qx =    x + fm_exp2_q[0];
+      px = px*x + fm_exp2_p[2];
+      qx = qx*x + fm_exp2_q[1];
+
+      px = px * fpart;
+
+      x = 1.0 + 2.0*(px/(qx-px));
+      return epart.f*x;
+  #else
+      return pow(2.0, x);
+  #endif
+  }
+
+  /*! Fast implementation of exp(x) for little endian CPUs
+   *
+   *  This function implements an optimized version of exp(x) for little endian CPUs.
+   *  It calls the exp2_x86(x) function with a suitable prefactor to x to return exp(x).
+   *  The implementation makes assumptions about the layout of double
+   *  precision floating point numbers in memory and thus will only work on little
+   *  endian CPUs.  If little endian cannot be safely detected, the result of
+   *  calling the exp(x) implementation in the standard math library will be returned.
+   *
+   *  \param   x argument
+   *  \return  value of e^x as double precision number */
+
+  KOKKOS_INLINE_FUNCTION
+  static double fm_exp(double x)
+  {
+  #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+      if (x < -1022.0/FM_DOUBLE_LOG2OFE) return 0;
+      if (x > 1023.0/FM_DOUBLE_LOG2OFE) return INFINITY;
+      return exp2_x86(FM_DOUBLE_LOG2OFE * x);
+  #else
+      return ::exp(x);
+  #endif
+  }
+
   // support function for scaled error function complement
 
   extern double erfcx_y100(const double y100);
 
-  // fast 2**x function without argument checks for little endian CPUs
-  extern double exp2_x86(double x);
-
-  // scaled error function complement exp(x*x)*erfc(x) for coul/long styles
+  /*! Fast scaled error function complement exp(x*x)*erfc(x) for coul/long styles
+   *
+   *  This is a portable fast implementation of exp(x*x)*erfc(x) that can be used
+   *  in coul/long pair styles as a replacement for the polynomial expansion that
+   *  is/was widely used.  Unlike the polynomial expansion, that is only accurate
+   *  at the level of single precision floating point it provides full double precision
+   *  accuracy, but at comparable speed (unlike the erfc() implementation shipped
+   *  with GNU standard math library).
+   *
+   *  \param   x argument
+   *  \return  value of e^(x*x)*erfc(x) */
 
   static inline double my_erfcx(const double x)
   {
-    if (x >= 0.0) return erfcx_y100(400.0/(4.0+x));
-    else return 2.0*exp(x*x) - erfcx_y100(400.0/(4.0-x));
+    if (x >= 0.0)
+      return erfcx_y100(400.0 / (4.0 + x));
+    else
+      return 2.0 * exp(x * x) - erfcx_y100(400.0 / (4.0 - x));
   }
 
-  // exp(-x*x) for coul/long styles
+  /*! Fast implementation of exp(-x*x) for little endian CPUs for coul/long styles
+   *
+   *  This function implements an optimized version of exp(-x*x) based on exp2_x86()
+   *  for use with little endian CPUs. If little endian cannot be safely detected,
+   *  the result of calling the exp(-x*x) implementation in the standard math
+   *  library will be returned.
+   *
+   *  \param   x argument
+   *  \return  value of e^(-x*x) as double precision number */
 
   static inline double expmsq(double x)
   {
     x *= x;
-    x *= 1.4426950408889634074; // log_2(e)
-#if defined(__BYTE_ORDER__) &&  __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    x *= 1.4426950408889634074;    // log_2(e)
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
     return (x < 1023.0) ? exp2_x86(-x) : 0.0;
 #else
     return (x < 1023.0) ? exp2(-x) : 0.0;
 #endif
   }
 
-#define FM_DOUBLE_LOG2OFE  1.4426950408889634074
-  // fast e**x function for little endian CPUs, falls back to libc on other platforms
-  KOKKOS_INLINE_FUNCTION
-  static double fm_exp(double x)
-  {
-#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-    return exp2_x86(FM_DOUBLE_LOG2OFE * x);
-#else
-    return ::exp(x);
-#endif
-  }
+  /*! Fast inline version of pow(x, 2.0)
+   *
+   *  \param   x argument
+   *  \return  x*x */
 
-  // x**2, use instead of pow(x,2.0)
   KOKKOS_INLINE_FUNCTION
-  static double square(const double &x) { return x*x; }
+  static double square(const double &x) { return x * x; }
+
+  /*! Fast inline version of pow(x, 3.0)
+   *
+   *  \param   x argument
+   *  \return  x*x */
 
-  // x**3, use instead of pow(x,3.0)
   KOKKOS_INLINE_FUNCTION
-  static double cube(const double &x) { return x*x*x; }
+  static double cube(const double &x) { return x * x * x; }
+
+  /* Fast inline version of pow(-1.0, n)
+   *
+   *  \param   n argument (integer)
+   *  \return  -1 if n is odd, 1.0 if n is even */
 
-  // return -1.0 for odd n, 1.0 for even n, like pow(-1.0,n)
   KOKKOS_INLINE_FUNCTION
   static double powsign(const int n) { return (n & 1) ? -1.0 : 1.0; }
 
-  // optimized version of pow(x,n) with n being integer
-  // up to 10x faster than pow(x,y)
+  /* Fast inline version of pow(x,n) for integer n
+   *
+   * This is a version of pow(x,n) optimized for n being integer.
+   * Speedups of up to 10x faster than pow(x,y) have been measured.
+   *
+   *  \param   n argument (integer)
+   *  \return  value of x^n */
 
   KOKKOS_INLINE_FUNCTION
-  static double powint(const double &x, const int n) {
-    double yy,ww;
+  static double powint(const double &x, const int n)
+  {
+    double yy, ww;
 
     if (x == 0.0) return 0.0;
     int nn = (n > 0) ? n : -n;
     ww = x;
 
-    for (yy = 1.0; nn != 0; nn >>= 1, ww *=ww)
+    for (yy = 1.0; nn != 0; nn >>= 1, ww *= ww)
       if (nn & 1) yy *= ww;
 
-    return (n > 0) ? yy : 1.0/yy;
+    return (n > 0) ? yy : 1.0 / yy;
   }
 
-  // optimized version of (sin(x)/x)**n with n being a _positive_ integer
+  /* Fast inline version of (sin(x)/x)^n as used by PPPM kspace styles
+   *
+   * This is an optimized function to compute (sin(x)/x)^n as frequently used by PPPM.
+   *
+   *  \param   n argument (integer). Expected to be positive.
+   *  \return  value of (sin(x)/x)^n */
 
   KOKKOS_INLINE_FUNCTION
-  static double powsinxx(const double &x, int n) {
-    double yy,ww;
+  static double powsinxx(const double &x, int n)
+  {
+    double yy, ww;
 
     if (x == 0.0) return 1.0;
 
-    ww = sin(x)/x;
+    ww = sin(x) / x;
 
-    for (yy = 1.0; n != 0; n >>= 1, ww *=ww)
+    for (yy = 1.0; n != 0; n >>= 1, ww *= ww)
       if (n & 1) yy *= ww;
 
     return yy;
   }
-}
-}
+}    // namespace MathSpecialKokkos
+}    // namespace LAMMPS_NS
 
 #endif
diff --git a/src/KOKKOS/meam_dens_final_kokkos.h b/src/KOKKOS/meam_dens_final_kokkos.h
index b44a8af3bd..4a212d3597 100644
--- a/src/KOKKOS/meam_dens_final_kokkos.h
+++ b/src/KOKKOS/meam_dens_final_kokkos.h
@@ -3,179 +3,148 @@
 
 using namespace LAMMPS_NS;
 
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 void
-MEAMKokkos<DeviceType>::meam_dens_final(int nlocal, int eflag_either, int eflag_global, int eflag_atom, double* eng_vdwl,
-                      typename ArrayTypes<DeviceType>::t_efloat_1d eatom, int ntype, typename AT::t_int_1d_randomread type, typename AT::t_int_1d_randomread fmap, int& errorflag)
+MEAMKokkos<DeviceType>::meam_dens_final(int nlocal, int eflag_either, int eflag_global, int eflag_atom,
+                      typename ArrayTypes<DeviceType>::t_efloat_1d eatom, int ntype, typename AT::t_int_1d type, typename AT::t_int_1d d_map, typename AT::t_int_2d d_scale, int& errorflag, EV_FLOAT &ev_all)
 {
   EV_FLOAT ev;
-  ev.evdwl = *eng_vdwl;
   this->eflag_either = eflag_either;
   this->eflag_global = eflag_global;
   this->eflag_atom = eflag_atom;
   this->d_eatom = eatom;
   this->ntype = ntype;
   this->type = type;
-  this->fmap = fmap;
+  this->d_map = d_map;
+  this->d_scale = d_scale;
 
-  //     Complete the calculation of density
+  Kokkos::deep_copy(d_errorflag,0);
 
+  // Complete the calculation of density
+
+  copymode = 1;
   Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMDensFinal>(0,nlocal),*this,ev);
-  *eng_vdwl = ev.evdwl;
+  ev_all.evdwl =+ ev.evdwl;
+  copymode = 0;
+
+  auto h_errorflag = Kokkos::create_mirror_view_and_copy(LMPHostType(),d_errorflag);
+  errorflag = h_errorflag();
 }
 
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void MEAMKokkos<DeviceType>::operator()(TagMEAMDensFinal, const int &i, EV_FLOAT& ev) const {
 
-  lattice_t elti;
-  int m;
-  int errorflag;
   F_FLOAT rhob, G, dG, Gbar, dGbar, gam, shp[3], Z;
-  F_FLOAT B, denom, rho_bkgd;
+  F_FLOAT denom, rho_bkgd, Fl;
+  double scaleii;
 
-  // may or may not be legal
-    elti = static_cast<lattice_t>(fmap[type[i]]);
-    if (elti >= 0) {
-      d_rho1[i] = 0.0;
-      d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i];
-      d_rho3[i] = 0.0;
-      for (m = 0; m < 3; m++) {
-        d_rho1[i] = d_rho1[i] + d_arho1(i,m) * d_arho1(i,m);
-        d_rho3[i] = d_rho3[i] - 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m);
-      }
-      for (m = 0; m < 6; m++) {
-        d_rho2[i] = d_rho2[i] + v2D[m] * d_arho2(i,m) * d_arho2(i,m);
-      }
-      for (m = 0; m < 10; m++) {
-        d_rho3[i] = d_rho3[i] + v3D[m] * d_arho3(i,m) * d_arho3(i,m);
-      }
+  int elti = d_map[type[i]];
+  if (elti >= 0) {
+    scaleii = d_scale(type[i],type[i]);
+    d_rho1[i] = 0.0;
+    d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i];
+    d_rho3[i] = 0.0;
+    for (int m = 0; m < 3; m++) {
+      d_rho1[i] += d_arho1(i,m) * d_arho1(i,m);
+      d_rho3[i] -= 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m);
+    }
+    for (int m = 0; m < 6; m++)
+      d_rho2[i] += v2D[m] * d_arho2(i,m) * d_arho2(i,m);
+    for (int m = 0; m < 10; m++)
+      d_rho3[i] += v3D[m] * d_arho3(i,m) * d_arho3(i,m);
 
-      if (d_rho0[i] > 0.0) {
-        if (this->ialloy == 1) {
-          d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0));
-          d_t_ave(i,1) = fdiv_zero_kk(d_t_ave(i,1), d_tsq_ave(i,1));
-          d_t_ave(i,2) = fdiv_zero_kk(d_t_ave(i,2), d_tsq_ave(i,2));
-        } else if (this->ialloy == 2) {
-          d_t_ave(i,0) = this->t1_meam[elti];
-          d_t_ave(i,1) = this->t2_meam[elti];
-          d_t_ave(i,2) = this->t3_meam[elti];
-        } else {
-          d_t_ave(i,0) = d_t_ave(i,0) / d_rho0[i];
-          d_t_ave(i,1) = d_t_ave(i,1) / d_rho0[i];
-          d_t_ave(i,2) = d_t_ave(i,2) / d_rho0[i];
-        }
-      }
-
-      d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i];
-
-      if (d_rho0[i] > 0.0) {
-        d_gamma[i] = d_gamma[i] / (d_rho0[i] * d_rho0[i]);
-      }
-
-      // need to double check, the analogous function is
-      // Z = get_Zij(this->lattce_meam[elti][elti]); in the non-KOKKOS version?
-      Z = get_Zij(elti);
-
-      G = G_gam(d_gamma[i], this->ibar_meam[elti], errorflag);
-      if (errorflag != 0)
-      {
-        //char str[128];
-        //sprintf(str,"MEAMKokkos library error %d",errorflag);
-        //error->one(FLERR,str);
-        return;
-      }
-      get_shpfcn(this->lattce_meam[elti][elti], shp);
-      if (this->ibar_meam[elti] <= 0) {
-        Gbar = 1.0;
-        dGbar = 0.0;
+    if (d_rho0[i] > 0.0) {
+      if (ialloy == 1) {
+	d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0));
+	d_t_ave(i,1) = fdiv_zero_kk(d_t_ave(i,1), d_tsq_ave(i,1));
+	d_t_ave(i,2) = fdiv_zero_kk(d_t_ave(i,2), d_tsq_ave(i,2));
+      } else if (ialloy == 2) {
+	d_t_ave(i,0) = t1_meam[elti];
+	d_t_ave(i,1) = t2_meam[elti];
+	d_t_ave(i,2) = t3_meam[elti];
       } else {
-        if (this->mix_ref_t == 1) {
-          gam = (d_t_ave(i,0) * shp[0] + d_t_ave(i,1) * shp[1] + d_t_ave(i,2) * shp[2]) / (Z * Z);
-        } else {
-          gam = (this->t1_meam[elti] * shp[0] + this->t2_meam[elti] * shp[1] + this->t3_meam[elti] * shp[2]) /
-                (Z * Z);
-        }
-        Gbar = G_gam(gam, this->ibar_meam[elti], errorflag);
-      }
-      d_rho[i] = d_rho0[i] * G;
-
-      if (this->mix_ref_t == 1) {
-        if (this->ibar_meam[elti] <= 0) {
-          Gbar = 1.0;
-          dGbar = 0.0;
-        } else {
-          gam = (d_t_ave(i,0) * shp[0] + d_t_ave(i,1) * shp[1] + d_t_ave(i,2) * shp[2]) / (Z * Z);
-          Gbar = dG_gam(gam, this->ibar_meam[elti], dGbar);
-        }
-        rho_bkgd = this->rho0_meam[elti] * Z * Gbar;
-      } else {
-        if (this->bkgd_dyn == 1) {
-          rho_bkgd = this->rho0_meam[elti] * Z;
-        } else {
-          rho_bkgd = this->rho_ref_meam[elti];
-        }
-      }
-      rhob = d_rho[i] / rho_bkgd;
-      denom = 1.0 / rho_bkgd;
-
-      G = dG_gam(d_gamma[i], this->ibar_meam[elti], dG);
-
-      d_dgamma1[i] = (G - 2 * dG * d_gamma[i]) * denom;
-
-      if (!iszero_kk(d_rho0[i])) {
-        d_dgamma2[i] = (dG / d_rho0[i]) * denom;
-      } else {
-        d_dgamma2[i] = 0.0;
-      }
-
-      //     dgamma3 is nonzero only if we are using the "mixed" rule for
-      //     computing t in the reference system (which is not correct, but
-      //     included for backward compatibility
-      if (this->mix_ref_t == 1) {
-        d_dgamma3[i] = d_rho0[i] * G * dGbar / (Gbar * Z * Z) * denom;
-      } else {
-        d_dgamma3[i] = 0.0;
-      }
-
-      B = this->A_meam[elti] * this->Ec_meam[elti][elti];
-
-      if (!iszero_kk(rhob)) {
-        if (this->emb_lin_neg == 1 && rhob <= 0) {
-          d_frhop[i] = -B;
-        } else {
-          d_frhop[i] = B * (log(rhob) + 1.0);
-        }
-        if (eflag_either != 0) {
-          if (eflag_global != 0) {
-            if (this->emb_lin_neg == 1 && rhob <= 0) {
-              //*eng_vdwl = *eng_vdwl - B * rhob;
-              ev.evdwl = ev.evdwl - B * rhob;
-            } else {
-              //*eng_vdwl = *eng_vdwl + B * rhob * log(rhob);
-              ev.evdwl = ev.evdwl + B * rhob * log(rhob);
-            }
-          }
-          if (eflag_atom != 0) {
-            if (this->emb_lin_neg == 1 && rhob <= 0) {
-              d_eatom[i] = d_eatom[i] - B * rhob;
-            } else {
-              d_eatom[i] = d_eatom[i] + B * rhob * log(rhob);
-            }
-          }
-        }
-      } else {
-        if (this->emb_lin_neg == 1) {
-          d_frhop[i] = -B;
-        } else {
-          d_frhop[i] = B;
-        }
+	d_t_ave(i,0) /= d_rho0[i];
+	d_t_ave(i,1) /= d_rho0[i];
+	d_t_ave(i,2) /= d_rho0[i];
       }
     }
-    if (errorflag)
-    {
-        //char str[128];
-        //sprintf(str,"MEAMKokkos library error %d",errorflag);
-        //error->one(FLERR,str);
+
+    d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i];
+
+    if (d_rho0[i] > 0.0)
+      d_gamma[i] /= (d_rho0[i] * d_rho0[i]);
+
+    Z = get_Zij(lattce_meam[elti][elti]);
+
+    G = G_gam(d_gamma[i], ibar_meam[elti], d_errorflag());
+    if (d_errorflag() != 0)
+      return;
+
+    get_shpfcn(lattce_meam[elti][elti], stheta_meam[elti][elti], ctheta_meam[elti][elti], shp);
+    if (ibar_meam[elti] <= 0) {
+      Gbar = 1.0;
+      dGbar = 0.0;
+    } else {
+      if (mix_ref_t == 1)
+	gam = (d_t_ave(i,0) * shp[0] + d_t_ave(i,1) * shp[1] + d_t_ave(i,2) * shp[2]) / (Z * Z);
+      else
+	gam = (t1_meam[elti] * shp[0] + t2_meam[elti] * shp[1] + t3_meam[elti] * shp[2]) /
+	      (Z * Z);
+      Gbar = G_gam(gam, ibar_meam[elti], d_errorflag());
     }
+    d_rho[i] = d_rho0[i] * G;
+
+    if (mix_ref_t == 1) {
+      if (ibar_meam[elti] <= 0) {
+	Gbar = 1.0;
+	dGbar = 0.0;
+      } else {
+	gam = (d_t_ave(i,0) * shp[0] + d_t_ave(i,1) * shp[1] + d_t_ave(i,2) * shp[2]) / (Z * Z);
+	Gbar = dG_gam(gam, ibar_meam[elti], dGbar);
+      }
+      rho_bkgd = rho0_meam[elti] * Z * Gbar;
+    } else {
+      if (bkgd_dyn == 1)
+	rho_bkgd = rho0_meam[elti] * Z;
+      else
+	rho_bkgd = rho_ref_meam[elti];
+    }
+    rhob = d_rho[i] / rho_bkgd;
+    denom = 1.0 / rho_bkgd;
+
+    G = dG_gam(d_gamma[i], ibar_meam[elti], dG);
+
+    d_dgamma1[i] = (G - 2 * dG * d_gamma[i]) * denom;
+
+    if (!iszero_kk(d_rho0[i]))
+      d_dgamma2[i] = (dG / d_rho0[i]) * denom;
+    else
+      d_dgamma2[i] = 0.0;
+
+    // dgamma3 is nonzero only if we are using the "mixed" rule for
+    // computing t in the reference system (which is not correct, but
+    // included for backward compatibility
+    if (mix_ref_t == 1)
+      d_dgamma3[i] = d_rho0[i] * G * dGbar / (Gbar * Z * Z) * denom;
+    else
+      d_dgamma3[i] = 0.0;
+
+    Fl = embedding(A_meam[elti], Ec_meam[elti][elti], rhob, d_frhop[i]);
+
+    if (eflag_either) {
+      Fl *= scaleii;
+      if (eflag_global) {
+        ev.evdwl += Fl;
+      }
+      if (eflag_atom) {
+        d_eatom[i] += Fl;
+      }
+    }
+  }
 }
+
diff --git a/src/KOKKOS/meam_dens_init_kokkos.h b/src/KOKKOS/meam_dens_init_kokkos.h
index c3dfdbbcb8..bcf0e7433e 100644
--- a/src/KOKKOS/meam_dens_init_kokkos.h
+++ b/src/KOKKOS/meam_dens_init_kokkos.h
@@ -1,41 +1,45 @@
 #include "meam_kokkos.h"
-#include "math_special.h"
-#include "mpi.h"
+#include "math_special_kokkos.h"
 
 using namespace LAMMPS_NS;
 using namespace MathSpecialKokkos;
 
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
-void MEAMKokkos<DeviceType>::operator()(TagMEAMDensInit<NEIGHFLAG>, const int &i, EV_FLOAT &ev) const {
+void MEAMKokkos<DeviceType>::operator()(TagMEAMDensInit<NEIGHFLAG>, const int &i) const {
   int ii, offsetval;
   ii = d_ilist_half[i];
   offsetval = d_offset[i];
-  //     Compute screening function and derivatives
+  // compute screening function and derivatives
   this->template getscreen<NEIGHFLAG>(ii, offsetval, x, d_numneigh_half, 
-            d_numneigh_full, ntype, type, fmap);
+            d_numneigh_full, ntype, type, d_map);
 
-  //     Calculate intermediate density terms to be communicated
-  this->template calc_rho1<NEIGHFLAG>(ii, ntype, type, fmap, x, d_numneigh_half, offsetval);
-  ev.evdwl += d_numneigh_half[i];
+  // calculate intermediate density terms to be communicated
+  this->template calc_rho1<NEIGHFLAG>(ii, ntype, type, d_map, x, d_numneigh_half, offsetval);
 }
 
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void MEAMKokkos<DeviceType>::operator()(TagMEAMInitialize, const int &i) const {
-    d_rho0[i] = 0.0;
-    d_arho2b[i] = 0.0;
-    d_arho1(i,0) = d_arho1(i,1) = d_arho1(i,2) = 0.0;
-    for (int j = 0; j < 6; j++)
-      d_arho2(i,j) = 0.0;
-    for (int j = 0; j < 10; j++)
-      d_arho3(i,j) = 0.0;
-    d_arho3b(i,0) = d_arho3b(i,1) = d_arho3b(i,2) = 0.0;
-    d_t_ave(i,0) = d_t_ave(i,1) = d_t_ave(i,2) = 0.0;
-    d_tsq_ave(i,0) = d_tsq_ave(i,1) = d_tsq_ave(i,2) = 0.0;
+void MEAMKokkos<DeviceType>::operator()(TagMEAMZero, const int &i) const {
+  d_rho0[i] = 0.0;
+  d_arho2b[i] = 0.0;
+  d_arho1(i,0) = d_arho1(i,1) = d_arho1(i,2) = 0.0;
+  for (int j = 0; j < 6; j++)
+    d_arho2(i,j) = 0.0;
+  for (int j = 0; j < 10; j++)
+    d_arho3(i,j) = 0.0;
+  d_arho3b(i,0) = d_arho3b(i,1) = d_arho3b(i,2) = 0.0;
+  d_t_ave(i,0) = d_t_ave(i,1) = d_t_ave(i,2) = 0.0;
+  d_tsq_ave(i,0) = d_tsq_ave(i,1) = d_tsq_ave(i,2) = 0.0;
 }
 
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 void
 MEAMKokkos<DeviceType>::meam_dens_setup(int atom_nmax, int nall, int n_neigh)
@@ -155,20 +159,21 @@ MEAMKokkos<DeviceType>::meam_dens_setup(int atom_nmax, int nall, int n_neigh)
 
   // zero out local arrays
 
-   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagMEAMInitialize>(0, nall),*this);   
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagMEAMZero>(0, nall),*this);
+  copymode = 0;
 }
 
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 void
-MEAMKokkos<DeviceType>::meam_dens_init(int inum_half, int ntype, typename AT::t_int_1d_randomread type, typename AT::t_int_1d_randomread fmap, typename AT::t_x_array_randomread x, typename AT::t_int_1d_randomread d_numneigh_half, typename AT::t_int_1d_randomread d_numneigh_full,
-                     int *fnoffset, typename AT::t_int_1d_randomread d_ilist_half, typename AT::t_neighbors_2d d_neighbors_half, typename AT::t_neighbors_2d d_neighbors_full, typename AT::t_int_1d_randomread d_offset, int neighflag)
+MEAMKokkos<DeviceType>::meam_dens_init(int inum_half, int ntype, typename AT::t_int_1d type, typename AT::t_int_1d d_map, typename AT::t_x_array x, typename AT::t_int_1d d_numneigh_half, typename AT::t_int_1d d_numneigh_full,
+                     typename AT::t_int_1d d_ilist_half, typename AT::t_neighbors_2d d_neighbors_half, typename AT::t_neighbors_2d d_neighbors_full, typename AT::t_int_1d d_offset, int neighflag, int need_dup)
 {
-  EV_FLOAT ev;
- 
-  ev.evdwl = 0; 
   this->ntype = ntype;
   this->type = type;
-  this->fmap = fmap;
+  this->d_map = d_map;
   this->x = x;
   this->d_numneigh_half = d_numneigh_half;
   this->d_numneigh_full = d_numneigh_full;
@@ -176,298 +181,325 @@ MEAMKokkos<DeviceType>::meam_dens_init(int inum_half, int ntype, typename AT::t_
   this->d_neighbors_half = d_neighbors_half;
   this->d_neighbors_full = d_neighbors_full;
   this->d_offset = d_offset;
-  if (neighflag == FULL)
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMDensInit<FULL> >(0,inum_half),*this, ev);
-  else if (neighflag == HALF)
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMDensInit<HALF> >(0,inum_half),*this, ev);
+  this->nlocal = nlocal;
+
+  if (need_dup) {
+    dup_rho0 = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_rho0);
+    dup_arho2b = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_arho2b);
+    dup_arho1 = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_arho1);
+    dup_arho2 = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_arho2);
+    dup_arho3 = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_arho3);
+    dup_arho3b = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_arho3b);
+    dup_t_ave = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_t_ave);
+    dup_tsq_ave = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_tsq_ave);
+  } else {
+    ndup_rho0 = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_rho0);
+    ndup_arho2b = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_arho2b);
+    ndup_arho1 = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_arho1);
+    ndup_arho2 = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_arho2);
+    ndup_arho3 = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_arho3);
+    ndup_arho3b = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_arho3b);
+    ndup_t_ave = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_t_ave);
+    ndup_tsq_ave = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_tsq_ave);
+  }
+
+  copymode = 1;
+  if (neighflag == HALF)
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagMEAMDensInit<HALF> >(0,inum_half),*this);
   else if (neighflag == HALFTHREAD)
-      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMDensInit<HALFTHREAD> >(0,inum_half),*this, ev);
-  *fnoffset = (int)ev.evdwl;
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagMEAMDensInit<HALFTHREAD> >(0,inum_half),*this);
+  copymode = 0;
+
+  if (need_dup) {
+    Kokkos::Experimental::contribute(d_rho0, dup_rho0);
+    Kokkos::Experimental::contribute(d_arho2b, dup_arho2b);
+    Kokkos::Experimental::contribute(d_arho1, dup_arho1);
+    Kokkos::Experimental::contribute(d_arho2, dup_arho2);
+    Kokkos::Experimental::contribute(d_arho3, dup_arho3);
+    Kokkos::Experimental::contribute(d_arho3b, dup_arho3b);
+    Kokkos::Experimental::contribute(d_t_ave, dup_t_ave);
+    Kokkos::Experimental::contribute(d_tsq_ave, dup_tsq_ave);
+
+    // free duplicated memory
+    dup_rho0 = decltype(dup_rho0)();
+    dup_arho2b = decltype(dup_arho2b)();
+    dup_arho1 = decltype(dup_arho1)();
+    dup_arho2 = decltype(dup_arho2)();
+    dup_arho3 = decltype(dup_arho3)();
+    dup_arho3b = decltype(dup_arho3b)();
+    dup_t_ave = decltype(dup_t_ave)();
+    dup_tsq_ave = decltype(dup_tsq_ave)();
+  }
 }
 
-// ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void
-MEAMKokkos<DeviceType>::getscreen(int i, int offset, typename AT::t_x_array_randomread x, typename AT::t_int_1d_randomread numneigh_half,
-                typename AT::t_int_1d_randomread numneigh_full, int /*ntype*/, typename AT::t_int_1d_randomread type, typename AT::t_int_1d_randomread fmap)
+MEAMKokkos<DeviceType>::getscreen(int i, int offset, typename AT::t_x_array x, typename AT::t_int_1d d_numneigh_half,
+                typename AT::t_int_1d d_numneigh_full, int /*ntype*/, typename AT::t_int_1d type, typename AT::t_int_1d d_map)
 const {
-  int jn, j, kn, k;
-  int elti, eltj, eltk;
-  X_FLOAT xitmp, yitmp, zitmp, delxij, delyij, delzij;
-  F_FLOAT  rij2, rij;
-  X_FLOAT xjtmp, yjtmp, zjtmp, delxik, delyik, delzik;
-  F_FLOAT  rik2 /*,rik*/;
-  X_FLOAT xktmp, yktmp, zktmp, delxjk, delyjk, delzjk;
-  F_FLOAT  rjk2 /*,rjk*/;
-  X_FLOAT xik, xjk;
-  F_FLOAT sij, fcij, sfcij, dfcij, sikj, dfikj, cikj;
-  F_FLOAT Cmin, Cmax, delc, /*ebound,*/ a, coef1, coef2;
-  F_FLOAT dCikj;
-  F_FLOAT rnorm, fc, dfc, drinv;
-
-  drinv = 1.0 / this->delr_meam;
-  elti = fmap[type[i]];
+  const double drinv = 1.0 / delr_meam;
+  const int elti = d_map[type[i]];
   if (elti < 0) return;
 
-  xitmp = x(i,0);
-  yitmp = x(i,1);
-  zitmp = x(i,2);
+  const double xitmp = x(i,0);
+  const double yitmp = x(i,1);
+  const double zitmp = x(i,2);
 
-  for (jn = 0; jn < numneigh_half[i]; jn++) {
-    //j = firstneigh[jn];
-    j = d_neighbors_half(i,jn);
+  for (int jn = 0; jn < d_numneigh_half[i]; jn++) {
+    const int j = d_neighbors_half(i,jn);
 
-    eltj = fmap[type[j]];
+    const int eltj = d_map[type[j]];
     if (eltj < 0) continue;
 
-    //     First compute screening function itself, sij
-    xjtmp = x(j,0);
-    yjtmp = x(j,1);
-    zjtmp = x(j,2);
-    delxij = xjtmp - xitmp;
-    delyij = yjtmp - yitmp;
-    delzij = zjtmp - zitmp;
+    // First compute screening function itself, sij
+    const double xjtmp = x(j,0);
+    const double yjtmp = x(j,1);
+    const double zjtmp = x(j,2);
+    const double delxij = xjtmp - xitmp;
+    const double delyij = yjtmp - yitmp;
+    const double delzij = zjtmp - zitmp;
 
-    rij2 = delxij * delxij + delyij * delyij + delzij * delzij;
-    rij = sqrt(rij2);
+    const double rij2 = delxij * delxij + delyij * delyij + delzij * delzij;
 
-    double rbound = this->ebound_meam[elti][eltj] * rij2;
-    if (rij > this->rc_meam) {
-      fcij = 0.0;
-      dfcij = 0.0;
-      sij = 0.0;
-    } else {
-      rnorm = (this->rc_meam - rij) * drinv;
-      sij = 1.0;
-
-      //     if rjk2 > ebound*rijsq, atom k is definitely outside the ellipse
-      for (kn = 0; kn < numneigh_full[i]; kn++) {
-        //k = firstneigh_full[kn];
-        k = d_neighbors_full(i,kn);
-        eltk = fmap[type[k]];
-        if (eltk < 0) continue;
-        if (k == j) continue;
-
-        delxjk = x(k,0) - xjtmp;
-        delyjk = x(k,1) - yjtmp;
-        delzjk = x(k,2) - zjtmp;
-        rjk2 = delxjk * delxjk + delyjk * delyjk + delzjk * delzjk;
-        if (rjk2 > rbound) continue;
-
-        delxik = x(k,0) - xitmp;
-        delyik = x(k,1) - yitmp;
-        delzik = x(k,2) - zitmp;
-        rik2 = delxik * delxik + delyik * delyik + delzik * delzik;
-        if (rik2 > rbound) continue;
-
-        xik = rik2 / rij2;
-        xjk = rjk2 / rij2;
-        a = 1 - (xik - xjk) * (xik - xjk);
-        //     if a < 0, then ellipse equation doesn't describe this case and
-        //     atom k can't possibly screen i-j
-        if (a <= 0.0) continue;
-
-        cikj = (2.0 * (xik + xjk) + a - 2.0) / a;
-        Cmax = this->Cmax_meam[elti][eltj][eltk];
-        Cmin = this->Cmin_meam[elti][eltj][eltk];
-        if (cikj >= Cmax) continue;
-        //     note that cikj may be slightly negative (within numerical
-        //     tolerance) if atoms are colinear, so don't reject that case here
-        //     (other negative cikj cases were handled by the test on "a" above)
-        else if (cikj <= Cmin) {
-          sij = 0.0;
-          break;
-        } else {
-       delc = Cmax - Cmin;
-          cikj = (cikj - Cmin) / delc;
-          sikj = fcut(cikj);
-        }
-        sij *= sikj;
-      }
-
-      fc = dfcut(rnorm, dfc);
-      fcij = fc;
-      dfcij = dfc * drinv;
-    }
-    //     Now compute derivatives
-    d_dscrfcn[offset+jn] = 0.0;
-    sfcij = sij * fcij;
-    if (iszero_kk(sfcij) || iszero_kk(sfcij - 1.0))
-    {
-      d_scrfcn[offset+jn] = sij;
-      d_fcpair[offset+jn] = fcij;
+    if (rij2 > cutforcesq) {
+      d_dscrfcn[offset+jn] = 0.0;
+      d_scrfcn[offset+jn] = 0.0;
+      d_fcpair[offset+jn] = 0.0;
       continue;
-      //goto LABEL_100;
     }
 
-    for (kn = 0; kn < numneigh_full[i]; kn++) {
-      //k = firstneigh_full[kn];
-      k = d_neighbors_full(i,kn);
+    // Now compute derivatives
+    const double rbound = ebound_meam[elti][eltj] * rij2;
+    const double rij = sqrt(rij2);
+    const double rnorm = (cutforce - rij) * drinv;
+    double sij = 1.0;
+
+    // if rjk2 > ebound*rijsq, atom k is definitely outside the ellipse
+    for (int kn = 0; kn < d_numneigh_full[i]; kn++) {
+      int k = d_neighbors_full(i,kn);
       if (k == j) continue;
-      eltk = fmap[type[k]];
+      int eltk = d_map[type[k]];
       if (eltk < 0) continue;
 
-      xktmp = x(k,0);
-      yktmp = x(k,1);
-      zktmp = x(k,2);
-      delxjk = xktmp - xjtmp;
-      delyjk = yktmp - yjtmp;
-      delzjk = zktmp - zjtmp;
-      rjk2 = delxjk * delxjk + delyjk * delyjk + delzjk * delzjk;
+      const double xktmp = x(k,0);
+      const double yktmp = x(k,1);
+      const double zktmp = x(k,2);
+
+      const double delxjk = xktmp - xjtmp;
+      const double delyjk = yktmp - yjtmp;
+      const double delzjk = zktmp - zjtmp;
+      const double rjk2 = delxjk * delxjk + delyjk * delyjk + delzjk * delzjk;
       if (rjk2 > rbound) continue;
 
-      delxik = xktmp - xitmp;
-      delyik = yktmp - yitmp;
-      delzik = zktmp - zitmp;
-      rik2 = delxik * delxik + delyik * delyik + delzik * delzik;
+      const double delxik = xktmp - xitmp;
+      const double delyik = yktmp - yitmp;
+      const double delzik = zktmp - zitmp;
+      const double rik2 = delxik * delxik + delyik * delyik + delzik * delzik;
       if (rik2 > rbound) continue;
 
-      xik = rik2 / rij2;
-      xjk = rjk2 / rij2;
-      a = 1 - (xik - xjk) * (xik - xjk);
-      //     if a < 0, then ellipse equation doesn't describe this case and
-      //     atom k can't possibly screen i-j
+      const double xik = rik2 / rij2;
+      const double xjk = rjk2 / rij2;
+      const double a = 1 - (xik - xjk) * (xik - xjk);
+      // if a < 0, then ellipse equation doesn't describe this case and
+      // atom k can't possibly screen i-j
       if (a <= 0.0) continue;
 
-      cikj = (2.0 * (xik + xjk) + a - 2.0) / a;
-      Cmax = this->Cmax_meam[elti][eltj][eltk];
-      Cmin = this->Cmin_meam[elti][eltj][eltk];
-      if (cikj >= Cmax) {
-        continue;
-        //     Note that cikj may be slightly negative (within numerical
-        //     tolerance) if atoms are colinear, so don't reject that case
-        //     here
-        //     (other negative cikj cases were handled by the test on "a"
-        //     above)
-        //     Note that we never have 0<cikj<Cmin here, else sij=0
-        //     (rejected above)
+      double cikj = (2.0 * (xik + xjk) + a - 2.0) / a;
+      const double Cmax = Cmax_meam[elti][eltj][eltk];
+      const double Cmin = Cmin_meam[elti][eltj][eltk];
+      double sikj;
+      if (cikj >= Cmax) continue;
+      // note that cikj may be slightly negative (within numerical
+      // tolerance) if atoms are colinear, so don't reject that case here
+      // (other negative cikj cases were handled by the test on "a" above)
+      else if (cikj <= Cmin) {
+        sij = 0.0;
+        break;
       } else {
-        delc = Cmax - Cmin;
+        const double delc = Cmax - Cmin;
         cikj = (cikj - Cmin) / delc;
-        sikj = dfcut(cikj, dfikj);
-        coef1 = dfikj / (delc * sikj);
-        dCikj = dCfunc(rij2, rik2, rjk2);
-        d_dscrfcn[offset+jn] = d_dscrfcn[offset+jn] + coef1 * dCikj;
+        sikj = fcut(cikj);
       }
+      sij *= sikj;
+    }
+
+    double dfc;
+    const double fc = dfcut(rnorm, dfc);
+    const double fcij = fc;
+    const double dfcij = dfc * drinv;
+
+    // Now compute derivatives
+    d_dscrfcn[offset+jn] = 0.0;
+    const double sfcij = sij * fcij;
+    if (!iszero_kk(sfcij) && !isone_kk(sfcij)) {
+      for (int kn = 0; kn < d_numneigh_full[i]; kn++) {
+        const int k = d_neighbors_full(i,kn);
+        if (k == j) continue;
+        const int eltk = d_map[type[k]];
+        if (eltk < 0) continue;
+
+        const double delxjk = x(k,0) - xjtmp;
+        const double delyjk = x(k,1) - yjtmp;
+        const double delzjk = x(k,2) - zjtmp;
+        const double rjk2 = delxjk * delxjk + delyjk * delyjk + delzjk * delzjk;
+        if (rjk2 > rbound) continue;
+
+        const double delxik = x(k,0) - xitmp;
+        const double delyik = x(k,1) - yitmp;
+        const double delzik = x(k,2) - zitmp;
+        const double rik2 = delxik * delxik + delyik * delyik + delzik * delzik;
+        if (rik2 > rbound) continue;
+
+        const double xik = rik2 / rij2;
+        const double xjk = rjk2 / rij2;
+        const double a = 1 - (xik - xjk) * (xik - xjk);
+        // if a < 0, then ellipse equation doesn't describe this case and
+        // atom k can't possibly screen i-j
+        if (a <= 0.0) continue;
+
+        double cikj = (2.0 * (xik + xjk) + a - 2.0) / a;
+        const double Cmax = Cmax_meam[elti][eltj][eltk];
+        const double Cmin = Cmin_meam[elti][eltj][eltk];
+        if (cikj >= Cmax) {
+          continue;
+          // Note that cikj may be slightly negative (within numerical
+          // tolerance) if atoms are colinear, so don't reject that case
+          // here
+          // (other negative cikj cases were handled by the test on "a"
+          // above)
+          // Note that we never have 0<cikj<Cmin here, else sij=0
+          // (rejected above)
+        } else {
+          const double delc = Cmax - Cmin;
+          cikj = (cikj - Cmin) / delc;
+          double dfikj;
+          const double sikj = dfcut(cikj, dfikj);
+          const double coef1 = dfikj / (delc * sikj);
+          const double dCikj = dCfunc(rij2, rik2, rjk2);
+          d_dscrfcn[offset+jn] += coef1 * dCikj;
+        }
+      }
+      const double coef1 = sfcij;
+      const double coef2 = sij * dfcij / rij;
+      d_dscrfcn[offset+jn] = d_dscrfcn[offset+jn] * coef1 - coef2;
     }
-    coef1 = sfcij;
-    coef2 = sij * dfcij / rij;
-    d_dscrfcn[offset+jn] = d_dscrfcn[offset+jn] * coef1 - coef2;
 
-    //LABEL_100:
     d_scrfcn[offset+jn] = sij;
     d_fcpair[offset+jn] = fcij;
   }
 }
 
-// ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void
-MEAMKokkos<DeviceType>::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d_randomread type, typename AT::t_int_1d_randomread fmap, typename AT::t_x_array_randomread x, typename AT::t_int_1d_randomread numneigh,
+MEAMKokkos<DeviceType>::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d type, typename AT::t_int_1d d_map, typename AT::t_x_array x, typename AT::t_int_1d d_numneigh,
                 int offset) const
 {
-  int jn, j, m, n, p, elti, eltj;
-  int nv2, nv3;
-  X_FLOAT xtmp, ytmp, ztmp, delij[3];
-  F_FLOAT rij2, rij, sij;
-  F_FLOAT ai, aj, rhoa0j, rhoa1j, rhoa2j, rhoa3j, A1j, A2j, A3j;
-  // double G,Gbar,gam,shp[3+1];
-  F_FLOAT ro0i, ro0j;
-  F_FLOAT rhoa0i, rhoa1i, rhoa2i, rhoa3i, A1i, A2i, A3i;
+  // The rho0, etc. arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial
 
-  // Likely to do: replace with duplicated view for OpenMP, atomic view for GPU abstraction
-  Kokkos::View<F_FLOAT*, typename DAT::t_ffloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_rho0 = d_rho0;
-  Kokkos::View<F_FLOAT*, typename DAT::t_ffloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_arho2b = d_arho2b;
-  Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_t_ave = d_t_ave;
-  Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_tsq_ave = d_tsq_ave;
-  Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_arho1 = d_arho1;
-  Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_arho2 = d_arho2;
-  Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_arho3 = d_arho3;
-  Kokkos::View<F_FLOAT**, typename DAT::t_ffloat_2d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_arho3b = d_arho3b;
+  auto v_rho0 = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_rho0),decltype(ndup_rho0)>::get(dup_rho0,ndup_rho0);
+  auto a_rho0 = v_rho0.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_arho2b = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_arho2b),decltype(ndup_arho2b)>::get(dup_arho2b,ndup_arho2b);
+  auto a_arho2b = v_arho2b.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_arho1 = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_arho1),decltype(ndup_arho1)>::get(dup_arho1,ndup_arho1);
+  auto a_arho1 = v_arho1.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_arho2 = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_arho2),decltype(ndup_arho2)>::get(dup_arho2,ndup_arho2);
+  auto a_arho2 = v_arho2.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_arho3 = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_arho3),decltype(ndup_arho3)>::get(dup_arho3,ndup_arho3);
+  auto a_arho3 = v_arho3.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_arho3b = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_arho3b),decltype(ndup_arho3b)>::get(dup_arho3b,ndup_arho3b);
+  auto a_arho3b = v_arho3b.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_t_ave = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_t_ave),decltype(ndup_t_ave)>::get(dup_t_ave,ndup_t_ave);
+  auto a_t_ave = v_t_ave.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_tsq_ave = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_tsq_ave),decltype(ndup_tsq_ave)>::get(dup_tsq_ave,ndup_tsq_ave);
+  auto a_tsq_ave = v_tsq_ave.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
 
-  elti = fmap[type[i]];
-  xtmp = x(i,0);
-  ytmp = x(i,1);
-  ztmp = x(i,2);
-  for (jn = 0; jn < numneigh[i]; jn++) {
+  const int elti = d_map[type[i]];
+  const double xtmp = x(i,0);
+  const double ytmp = x(i,1);
+  const double ztmp = x(i,2);
+  for (int jn = 0; jn < d_numneigh[i]; jn++) {
     if (!iszero_kk(d_scrfcn[offset+jn])) {
-      //j = firstneigh[jn];
-      j = d_neighbors_half(i,jn);
-      sij = d_scrfcn[offset+jn] * d_fcpair[offset+jn];
+      const int j = d_neighbors_half(i,jn);
+      const double sij = d_scrfcn[offset+jn] * d_fcpair[offset+jn];
+      double delij[3];
       delij[0] = x(j,0) - xtmp;
       delij[1] = x(j,1) - ytmp;
       delij[2] = x(j,2) - ztmp;
-      rij2 = delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2];
-      if (rij2 < this->cutforcesq) {
-        eltj = fmap[type[j]];
-        rij = sqrt(rij2);
-        ai = rij / this->re_meam[elti][elti] - 1.0;
-        aj = rij / this->re_meam[eltj][eltj] - 1.0;
-        ro0i = this->rho0_meam[elti];
-        ro0j = this->rho0_meam[eltj];
-        rhoa0j = ro0j * MathSpecialKokkos::fm_exp(-this->beta0_meam[eltj] * aj) * sij;
-        rhoa1j = ro0j * MathSpecialKokkos::fm_exp(-this->beta1_meam[eltj] * aj) * sij;
-        rhoa2j = ro0j * MathSpecialKokkos::fm_exp(-this->beta2_meam[eltj] * aj) * sij;
-        rhoa3j = ro0j * MathSpecialKokkos::fm_exp(-this->beta3_meam[eltj] * aj) * sij;
-        rhoa0i = ro0i * MathSpecialKokkos::fm_exp(-this->beta0_meam[elti] * ai) * sij;
-        rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-this->beta1_meam[elti] * ai) * sij;
-        rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-this->beta2_meam[elti] * ai) * sij;
-        rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-this->beta3_meam[elti] * ai) * sij;
-        if (this->ialloy == 1) {
-          rhoa1j = rhoa1j * this->t1_meam[eltj];
-          rhoa2j = rhoa2j * this->t2_meam[eltj];
-          rhoa3j = rhoa3j * this->t3_meam[eltj];
-          rhoa1i = rhoa1i * this->t1_meam[elti];
-          rhoa2i = rhoa2i * this->t2_meam[elti];
-          rhoa3i = rhoa3i * this->t3_meam[elti];
+      const double rij2 = delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2];
+      if (rij2 < cutforcesq) {
+        const int eltj = d_map[type[j]];
+        const double rij = sqrt(rij2);
+        const double ai = rij / re_meam[elti][elti] - 1.0;
+        const double aj = rij / re_meam[eltj][eltj] - 1.0;
+        const double ro0i = rho0_meam[elti];
+        const double ro0j = rho0_meam[eltj];
+        const double rhoa0j = ro0j * MathSpecialKokkos::fm_exp(-beta0_meam[eltj] * aj) * sij;
+        double rhoa1j = ro0j * MathSpecialKokkos::fm_exp(-beta1_meam[eltj] * aj) * sij;
+        double rhoa2j = ro0j * MathSpecialKokkos::fm_exp(-beta2_meam[eltj] * aj) * sij;
+        double rhoa3j = ro0j * MathSpecialKokkos::fm_exp(-beta3_meam[eltj] * aj) * sij;
+        const double rhoa0i = ro0i * MathSpecialKokkos::fm_exp(-beta0_meam[elti] * ai) * sij;
+        double rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-beta1_meam[elti] * ai) * sij;
+        double rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-beta2_meam[elti] * ai) * sij;
+        double rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-beta3_meam[elti] * ai) * sij;
+        if (ialloy == 1) {
+          rhoa1j *= t1_meam[eltj];
+          rhoa2j *= t2_meam[eltj];
+          rhoa3j *= t3_meam[eltj];
+          rhoa1i *= t1_meam[elti];
+          rhoa2i *= t2_meam[elti];
+          rhoa3i *= t3_meam[elti];
         }
         a_rho0[i] += rhoa0j;
         a_rho0[j] += rhoa0i;
         // For ialloy = 2, use single-element value (not average)
-        if (this->ialloy != 2) {
-          a_t_ave(i,0) = a_t_ave(i,0) + this->t1_meam[eltj] * rhoa0j;
-          a_t_ave(i,1) = a_t_ave(i,1) + this->t2_meam[eltj] * rhoa0j;
-          a_t_ave(i,2) = a_t_ave(i,2) + this->t3_meam[eltj] * rhoa0j;
-          a_t_ave(j,0) = a_t_ave(j,0) + this->t1_meam[elti] * rhoa0i;
-          a_t_ave(j,1) = a_t_ave(j,1) + this->t2_meam[elti] * rhoa0i;
-          a_t_ave(j,2) = a_t_ave(j,2) + this->t3_meam[elti] * rhoa0i;
+        if (ialloy != 2) {
+          a_t_ave(i,0) += t1_meam[eltj] * rhoa0j;
+          a_t_ave(i,1) += t2_meam[eltj] * rhoa0j;
+          a_t_ave(i,2) += t3_meam[eltj] * rhoa0j;
+          a_t_ave(j,0) += t1_meam[elti] * rhoa0i;
+          a_t_ave(j,1) += t2_meam[elti] * rhoa0i;
+          a_t_ave(j,2) += t3_meam[elti] * rhoa0i;
         }
-        if (this->ialloy == 1) {
-          a_tsq_ave(i,0) = a_tsq_ave(i,0) + this->t1_meam[eltj] * this->t1_meam[eltj] * rhoa0j;
-          a_tsq_ave(i,1) = a_tsq_ave(i,1) + this->t2_meam[eltj] * this->t2_meam[eltj] * rhoa0j;
-          a_tsq_ave(i,2) = a_tsq_ave(i,2) + this->t3_meam[eltj] * this->t3_meam[eltj] * rhoa0j;
-          a_tsq_ave(j,0) = a_tsq_ave(j,0) + this->t1_meam[elti] * this->t1_meam[elti] * rhoa0i;
-          a_tsq_ave(j,1) = a_tsq_ave(j,1) + this->t2_meam[elti] * this->t2_meam[elti] * rhoa0i;
-          a_tsq_ave(j,2) = a_tsq_ave(j,2) + this->t3_meam[elti] * this->t3_meam[elti] * rhoa0i;
+        if (ialloy == 1) {
+          a_tsq_ave(i,0) += t1_meam[eltj] * t1_meam[eltj] * rhoa0j;
+          a_tsq_ave(i,1) += t2_meam[eltj] * t2_meam[eltj] * rhoa0j;
+          a_tsq_ave(i,2) += t3_meam[eltj] * t3_meam[eltj] * rhoa0j;
+          a_tsq_ave(j,0) += t1_meam[elti] * t1_meam[elti] * rhoa0i;
+          a_tsq_ave(j,1) += t2_meam[elti] * t2_meam[elti] * rhoa0i;
+          a_tsq_ave(j,2) += t3_meam[elti] * t3_meam[elti] * rhoa0i;
         }
         a_arho2b[i] += rhoa2j;
         a_arho2b[j] += rhoa2i;
 
-        A1j = rhoa1j / rij;
-        A2j = rhoa2j / rij2;
-        A3j = rhoa3j / (rij2 * rij);
-        A1i = rhoa1i / rij;
-        A2i = rhoa2i / rij2;
-        A3i = rhoa3i / (rij2 * rij);
-        nv2 = 0;
-        nv3 = 0;
-        for (m = 0; m < 3; m++) {
+        const double A1j = rhoa1j / rij;
+        const double A2j = rhoa2j / rij2;
+        const double A3j = rhoa3j / (rij2 * rij);
+        const double A1i = rhoa1i / rij;
+        const double A2i = rhoa2i / rij2;
+        const double A3i = rhoa3i / (rij2 * rij);
+        int nv2 = 0;
+        int nv3 = 0;
+        for (int m = 0; m < 3; m++) {
           a_arho1(i,m) += A1j * delij[m];
-          a_arho1(j,m) += (-A1i * delij[m]);
+          a_arho1(j,m) += -A1i * delij[m];
           a_arho3b(i,m) += rhoa3j * delij[m] / rij;
-          a_arho3b(j,m) += (- rhoa3i * delij[m] / rij);
-          for (n = m; n < 3; n++) {
+          a_arho3b(j,m) += -rhoa3i * delij[m] / rij;
+          for (int n = m; n < 3; n++) {
             a_arho2(i,nv2) += A2j * delij[m] * delij[n];
             a_arho2(j,nv2) += A2i * delij[m] * delij[n];
-            nv2 = nv2 + 1;
-            for (p = n; p < 3; p++) {
+            nv2++;
+            for (int p = n; p < 3; p++) {
               a_arho3(i,nv3) += A3j * delij[m] * delij[n] * delij[p];
-              a_arho3(j,nv3) += (- A3i * delij[m] * delij[n] * delij[p]);
-              nv3 = nv3 + 1;
+              a_arho3(j,nv3) += -A3i * delij[m] * delij[n] * delij[p];
+              nv3++;
             }
           }
         }
@@ -476,79 +508,81 @@ MEAMKokkos<DeviceType>::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d_ra
   }
 }
 
+/* ---------------------------------------------------------------------- */
+
 //Cutoff function and derivative
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-double MEAMKokkos<DeviceType>::dfcut(const double xi, double& dfc) const {
-    double a, a3, a4, a1m4;
-    if (xi >= 1.0) {
-      dfc = 0.0;
-      return 1.0;
-    } else if (xi <= 0.0) {
-      dfc = 0.0;
-      return 0.0;
-    } else {
-      a = 1.0 - xi;
-      a3 = a * a * a;
-      a4 = a * a3;
-      a1m4 = 1.0-a4;
+double MEAMKokkos<DeviceType>::dfcut(const double xi, double& dfc) const
+{
+  if (xi >= 1.0) {
+    dfc = 0.0;
+    return 1.0;
+  } else if (xi <= 0.0) {
+    dfc = 0.0;
+    return 0.0;
+  } else {
+    const double a = 1.0 - xi;
+    const double a3 = a * a * a;
+    const double a4 = a * a3;
+    const double a1m4 = 1.0 - a4;
 
-      dfc = 8 * a1m4 * a3;
-      return a1m4*a1m4;
-    }
+    dfc = 8 * a1m4 * a3;
+    return a1m4*a1m4;
   }
+}
 
   //-----------------------------------------------------------------------------
-  //  // Derivative of Cikj w.r.t. rij
-  //    //     Inputs: rij,rij2,rik2,rjk2
-  //      //
-  //
+  // Derivative of Cikj w.r.t. rij
+  // Inputs: rij,rij2,rik2,rjk2
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 double MEAMKokkos<DeviceType>::dCfunc(const double rij2, const double rik2, const double rjk2) const
 {
-    double rij4, a, asq, b,denom;
-
-    rij4 = rij2 * rij2;
-    a = rik2 - rjk2;
-    b = rik2 + rjk2;
-    asq = a*a;
-    denom = rij4 - asq;
-    denom = denom * denom;
-    return -4 * (-2 * rij2 * asq + rij4 * b + asq * b) / denom;
+  const double rij4 = rij2 * rij2;
+  const double a = rik2 - rjk2;
+  const double b = rik2 + rjk2;
+  const double asq = a*a;
+  double denom = rij4 - asq;
+  denom = denom * denom;
+  return -4 * (-2 * rij2 * asq + rij4 * b + asq * b) / denom;
 }
 
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void MEAMKokkos<DeviceType>::dCfunc2(const double rij2, const double rik2, const double rjk2, double& dCikj1, double& dCikj2) const
 {
-    double rij4, rik4, rjk4, a, denom;
-
-    rij4 = rij2 * rij2;
-    rik4 = rik2 * rik2;
-    rjk4 = rjk2 * rjk2;
-    a = rik2 - rjk2;
-    denom = rij4 - a * a;
-    denom = denom * denom;
-    dCikj1 = 4 * rij2 * (rij4 + rik4 + 2 * rik2 * rjk2 - 3 * rjk4 - 2 * rij2 * a) / denom;
-    dCikj2 = 4 * rij2 * (rij4 - 3 * rik4 + 2 * rik2 * rjk2 + rjk4 + 2 * rij2 * a) / denom;
-
+  const double rij4 = rij2 * rij2;
+  const double rik4 = rik2 * rik2;
+  const double rjk4 = rjk2 * rjk2;
+  const double a = rik2 - rjk2;
+  double denom = rij4 - a * a;
+  denom = denom * denom;
+  dCikj1 = 4 * rij2 * (rij4 + rik4 + 2 * rik2 * rjk2 - 3 * rjk4 - 2 * rij2 * a) / denom;
+  dCikj2 = 4 * rij2 * (rij4 - 3 * rik4 + 2 * rik2 * rjk2 + rjk4 + 2 * rij2 * a) / denom;
 }
+
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-double MEAMKokkos<DeviceType>::fcut(const double xi) const{
-    double a;
-    if (xi >= 1.0)
-      return 1.0;
-    else if (xi <= 0.0)
-      return 0.0;
-    else {
-      a = 1.0 - xi;
-      a *= a; a *= a;
-      a = 1.0 - a;
-      return a * a;
-    }
+double MEAMKokkos<DeviceType>::fcut(const double xi) const
+{
+  double a;
+  if (xi >= 1.0)
+    return 1.0;
+  else if (xi <= 0.0)
+    return 0.0;
+  else {
+    // ( 1.d0 - (1.d0 - xi)**4 )**2, but with better codegen
+    a = 1.0 - xi;
+    a *= a; a *= a;
+    a = 1.0 - a;
+    return a * a;
   }
+}
 
diff --git a/src/KOKKOS/meam_force_kokkos.h b/src/KOKKOS/meam_force_kokkos.h
index 4e061c9043..aafcf80473 100644
--- a/src/KOKKOS/meam_force_kokkos.h
+++ b/src/KOKKOS/meam_force_kokkos.h
@@ -5,106 +5,129 @@
 using namespace LAMMPS_NS;
 using namespace MathSpecialKokkos;
 
-
 template<class DeviceType>
 void
-MEAMKokkos<DeviceType>::meam_force(int inum_half, int eflag_either, int eflag_global, int eflag_atom, int vflag_atom, double* eng_vdwl,
-                 typename ArrayTypes<DeviceType>::t_efloat_1d eatom, int ntype, typename AT::t_int_1d_randomread type, typename AT::t_int_1d_randomread fmap, typename AT::t_x_array_randomread x, typename AT::t_int_1d_randomread numneigh,
-                 typename AT::t_int_1d_randomread numneigh_full, typename AT::t_f_array f, typename ArrayTypes<DeviceType>::t_virial_array vatom, typename AT::t_int_1d_randomread d_ilist_half, typename AT::t_int_1d_randomread d_offset, typename AT::t_neighbors_2d d_neighbors_half, typename AT::t_neighbors_2d d_neighbors_full, int neighflag)
+MEAMKokkos<DeviceType>::meam_force(int inum_half, int eflag_global, int eflag_atom, int vflag_global, int vflag_atom,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d eatom, int ntype, typename AT::t_int_1d type, typename AT::t_int_1d d_map, typename AT::t_x_array x, typename AT::t_int_1d numneigh,
+                 typename AT::t_int_1d numneigh_full, typename AT::t_f_array f, typename ArrayTypes<DeviceType>::t_virial_array vatom, typename AT::t_int_1d d_ilist_half, typename AT::t_int_1d d_offset, typename AT::t_neighbors_2d d_neighbors_half, typename AT::t_neighbors_2d d_neighbors_full, int neighflag, int need_dup, EV_FLOAT &ev_all)
 {
-    EV_FLOAT ev;
-    ev.evdwl = *eng_vdwl;
+  EV_FLOAT ev;
 
-    this->eflag_either = eflag_either;
-    this->eflag_global = eflag_global;
-    this->eflag_atom = eflag_atom;
-    this->vflag_atom = vflag_atom;
-    this->d_eatom = eatom;
-    this->ntype = ntype;
-    this->type = type;
-    this->fmap = fmap;
-    this->x = x;
-    this->d_numneigh_half = numneigh;
-    this->d_numneigh_full = numneigh_full;
-    this->d_neighbors_half = d_neighbors_half;
-    this->d_neighbors_full = d_neighbors_full;
-    this->f = f;
-    this->d_vatom = vatom;
-    this->d_ilist_half = d_ilist_half;
-    this->d_offset = d_offset;
-  
-    if (neighflag == FULL) 
-       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMforce<FULL>>(0,inum_half),*this,ev);
-    else if (neighflag == HALF)
-       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMforce<HALF>>(0,inum_half),*this,ev);
-    else if (neighflag == HALFTHREAD)
-       Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMforce<HALFTHREAD>>(0,inum_half),*this,ev);
-    *eng_vdwl = ev.evdwl;
+  this->eflag_either = eflag_either;
+  this->eflag_global = eflag_global;
+  this->eflag_atom = eflag_atom;
+  this->vflag_global = vflag_global;
+  this->vflag_atom = vflag_atom;
+  eflag_either = eflag_atom || eflag_global;
+  vflag_either = vflag_atom || vflag_global;
+  this->d_eatom = eatom;
+  this->ntype = ntype;
+  this->type = type;
+  this->d_map = d_map;
+  this->x = x;
+  this->d_numneigh_half = numneigh;
+  this->d_numneigh_full = numneigh_full;
+  this->d_neighbors_half = d_neighbors_half;
+  this->d_neighbors_full = d_neighbors_full;
+  this->f = f;
+  this->d_vatom = vatom;
+  this->d_ilist_half = d_ilist_half;
+  this->d_offset = d_offset;
+
+  if (need_dup) {
+    dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(f);
+    if (eflag_atom) dup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom);
+    if (vflag_atom) dup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom);
+  } else {
+    ndup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(f);
+    if (eflag_atom) ndup_eatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom);
+    if (vflag_atom) ndup_vatom = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom);
+  }
+
+  copymode = 1;
+  if (neighflag == HALF)
+     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMForce<HALF>>(0,inum_half),*this,ev);
+  else if (neighflag == HALFTHREAD)
+     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMForce<HALFTHREAD>>(0,inum_half),*this,ev);
+  ev_all += ev;
+  copymode = 0;
+
+  if (need_dup) {
+    Kokkos::Experimental::contribute(f, dup_f);
+    if (eflag_atom) Kokkos::Experimental::contribute(d_eatom, dup_eatom);
+    if (vflag_atom) Kokkos::Experimental::contribute(d_vatom, dup_vatom);
+
+    // free duplicated memory
+    dup_f = decltype(dup_f)();
+    if (eflag_atom) dup_eatom = decltype(dup_eatom)();
+    if (vflag_atom) dup_vatom = decltype(dup_vatom)();
+  }
 }
 
 template<class DeviceType>
 template<int NEIGHFLAG>
 KOKKOS_INLINE_FUNCTION
 void
-MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FLOAT& ev) const {
+MEAMKokkos<DeviceType>::operator()(TagMEAMForce<NEIGHFLAG>, const int &ii, EV_FLOAT& ev) const {
   int i, j, jn, k, kn, kk, m, n, p, q;
   int nv2, nv3, elti, eltj, eltk, ind;
   X_FLOAT xitmp, yitmp, zitmp, delij[3];
-  F_FLOAT rij2, rij, rij3;
-  F_FLOAT v[6], fi[3], fj[3];
-  F_FLOAT third, sixth;
-  F_FLOAT pp, dUdrij, dUdsij, dUdrijm[3], force, forcem;
-  F_FLOAT r, recip, phi, phip;
-  F_FLOAT sij;
-  F_FLOAT a1, a1i, a1j, a2, a2i, a2j;
-  F_FLOAT a3i, a3j;
-  F_FLOAT shpi[3], shpj[3];
-  F_FLOAT ai, aj, ro0i, ro0j, invrei, invrej;
-  F_FLOAT rhoa0j, drhoa0j, rhoa0i, drhoa0i;
-  F_FLOAT rhoa1j, drhoa1j, rhoa1i, drhoa1i;
-  F_FLOAT rhoa2j, drhoa2j, rhoa2i, drhoa2i;
-  F_FLOAT a3, a3a, rhoa3j, drhoa3j, rhoa3i, drhoa3i;
-  F_FLOAT drho0dr1, drho0dr2, drho0ds1, drho0ds2;
-  F_FLOAT drho1dr1, drho1dr2, drho1ds1, drho1ds2;
-  F_FLOAT drho1drm1[3], drho1drm2[3];
-  F_FLOAT drho2dr1, drho2dr2, drho2ds1, drho2ds2;
-  F_FLOAT drho2drm1[3], drho2drm2[3];
-  F_FLOAT drho3dr1, drho3dr2, drho3ds1, drho3ds2;
-  F_FLOAT drho3drm1[3], drho3drm2[3];
-  F_FLOAT dt1dr1, dt1dr2, dt1ds1, dt1ds2;
-  F_FLOAT dt2dr1, dt2dr2, dt2ds1, dt2ds2;
-  F_FLOAT dt3dr1, dt3dr2, dt3ds1, dt3ds2;
-  F_FLOAT drhodr1, drhodr2, drhods1, drhods2, drhodrm1[3], drhodrm2[3];
-  F_FLOAT arg;
-  F_FLOAT arg1i1, arg1j1, arg1i2, arg1j2, arg1i3, arg1j3, arg3i3, arg3j3;
-  F_FLOAT dsij1, dsij2, force1, force2;
-  F_FLOAT t1i, t2i, t3i, t1j, t2j, t3j;
+  double rij2, rij, rij3;
+  double v[6], fi[3], fj[3];
+  double third, sixth;
+  double pp, dUdrij, dUdsij, dUdrijm[3], force, forcem;
+  double recip, phi, phip;
+  double sij;
+  double a1, a1i, a1j, a2, a2i, a2j;
+  double a3i, a3j;
+  double shpi[3], shpj[3];
+  double ai, aj, ro0i, ro0j, invrei, invrej;
+  double rhoa0j, drhoa0j, rhoa0i, drhoa0i;
+  double rhoa1j, drhoa1j, rhoa1i, drhoa1i;
+  double rhoa2j, drhoa2j, rhoa2i, drhoa2i;
+  double a3, a3a, rhoa3j, drhoa3j, rhoa3i, drhoa3i;
+  double drho0dr1, drho0dr2, drho0ds1, drho0ds2;
+  double drho1dr1, drho1dr2, drho1ds1, drho1ds2;
+  double drho1drm1[3], drho1drm2[3];
+  double drho2dr1, drho2dr2, drho2ds1, drho2ds2;
+  double drho2drm1[3], drho2drm2[3];
+  double drho3dr1, drho3dr2, drho3ds1, drho3ds2;
+  double drho3drm1[3], drho3drm2[3];
+  double dt1dr1, dt1dr2, dt1ds1, dt1ds2;
+  double dt2dr1, dt2dr2, dt2ds1, dt2ds2;
+  double dt3dr1, dt3dr2, dt3ds1, dt3ds2;
+  double drhodr1, drhodr2, drhods1, drhods2, drhodrm1[3], drhodrm2[3];
+  double arg;
+  double arg1i1, arg1j1, arg1i2, arg1j2, arg1i3, arg1j3, arg3i3, arg3j3;
+  double dsij1, dsij2, force1, force2;
+  double t1i, t2i, t3i, t1j, t2j, t3j;
   int fnoffset;
 
-  // To do: update with duplication for OpenMP, atomic for GPUs
-  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_eatom = d_eatom;
-  Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_vatom = d_vatom;
-  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
+  // The f, etc. arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial
 
+  auto v_f = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
+  auto a_f = v_f.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_eatom = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom);
+  auto a_eatom = v_eatom.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_vatom = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom);
+  auto a_vatom = v_vatom.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
 
   i = d_ilist_half[ii];
   fnoffset = d_offset[i];
   third = 1.0 / 3.0;
   sixth = 1.0 / 6.0;
-  //     Compute forces atom i
 
-  elti = fmap[type[i]];
+  elti = d_map[type[i]];
   if (elti < 0) return;
 
   xitmp = x(i,0);
   yitmp = x(i,1);
   zitmp = x(i,2);
 
-  //     Treat each pair
+  // Treat each pair
   for (jn = 0; jn < d_numneigh_half[i]; jn++) {
-    //j = firstneigh[jn];
     j = d_neighbors_half(i,jn);
-    eltj = fmap[type[j]];
+    eltj = d_map[type[j]];
 
     if (!iszero_kk(d_scrfcn[fnoffset + jn]) && eltj >= 0) {
 
@@ -113,60 +136,60 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
       delij[1] = x(j,1) - yitmp;
       delij[2] = x(j,2) - zitmp;
       rij2 = delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2];
-      if (rij2 < this->cutforcesq) {
+      if (rij2 < cutforcesq) {
         rij = sqrt(rij2);
-        r = rij;
+        recip = 1.0 / rij;
 
-        //     Compute phi and phip
-        ind = this->eltind[elti][eltj];
-        pp = rij * this->rdrar;
+        // Compute phi and phip
+        ind = eltind[elti][eltj];
+        pp = rij * rdrar;
         kk = (int)pp;
-        kk = (kk <= (this->nrar - 2))?kk:this->nrar - 2;
+        kk = (kk <= (nrar - 2))?kk:nrar - 2;
         pp = pp - kk;
         pp = (pp <= 1.0)?pp:1.0;
-        phi = ((this->d_phirar3(ind,kk) * pp + this->d_phirar2(ind,kk)) * pp + this->d_phirar1(ind,kk)) * pp +
-          this->d_phirar(ind,kk);
-        phip = (this->d_phirar6(ind,kk) * pp + this->d_phirar5(ind,kk)) * pp + this->d_phirar4(ind,kk);
-        recip = 1.0 / r;
+        phi = ((d_phirar3(ind,kk) * pp + d_phirar2(ind,kk)) * pp + d_phirar1(ind,kk)) * pp +
+          d_phirar(ind,kk);
+        phip = (d_phirar6(ind,kk) * pp + d_phirar5(ind,kk)) * pp + d_phirar4(ind,kk);
 
-        if (eflag_either != 0) {
-          if (eflag_global != 0)
-            //*eng_vdwl = *eng_vdwl + phi * sij;
-            ev.evdwl = ev.evdwl + phi * sij;
-          if (eflag_atom != 0) {
-            a_eatom[i] += (0.5 * phi * sij);
-            a_eatom[j] += (0.5 * phi * sij);
+        if (eflag_either) {
+          double scaleij = d_scale(type[i],type[i]);
+          double phi_sc = phi * scaleij;
+          if (eflag_global)
+            ev.evdwl += phi_sc * sij;
+          if (eflag_atom) {
+            a_eatom[i] += 0.5 * phi * sij;
+            a_eatom[j] += 0.5 * phi * sij;
           }
         }
 
-        //     write(1,*) "force_meamf: phi: ",phi
-        //     write(1,*) "force_meamf: phip: ",phip
+        // write(1,*) "force_meamf: phi: ",phi
+        // write(1,*) "force_meamf: phip: ",phip
 
-        //     Compute pair densities and derivatives
-        invrei = 1.0 / this->re_meam[elti][elti];
+        // Compute pair densities and derivatives
+        invrei = 1.0 / re_meam[elti][elti];
         ai = rij * invrei - 1.0;
-        ro0i = this->rho0_meam[elti];
-        rhoa0i = ro0i * MathSpecialKokkos::fm_exp(-this->beta0_meam[elti] * ai);
-        drhoa0i = -this->beta0_meam[elti] * invrei * rhoa0i;
-        rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-this->beta1_meam[elti] * ai);
-        drhoa1i = -this->beta1_meam[elti] * invrei * rhoa1i;
-        rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-this->beta2_meam[elti] * ai);
-        drhoa2i = -this->beta2_meam[elti] * invrei * rhoa2i;
-        rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-this->beta3_meam[elti] * ai);
-        drhoa3i = -this->beta3_meam[elti] * invrei * rhoa3i;
+        ro0i = rho0_meam[elti];
+        rhoa0i = ro0i * MathSpecialKokkos::fm_exp(-beta0_meam[elti] * ai);
+        drhoa0i = -beta0_meam[elti] * invrei * rhoa0i;
+        rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-beta1_meam[elti] * ai);
+        drhoa1i = -beta1_meam[elti] * invrei * rhoa1i;
+        rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-beta2_meam[elti] * ai);
+        drhoa2i = -beta2_meam[elti] * invrei * rhoa2i;
+        rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-beta3_meam[elti] * ai);
+        drhoa3i = -beta3_meam[elti] * invrei * rhoa3i;
 
         if (elti != eltj) {
-          invrej = 1.0 / this->re_meam[eltj][eltj];
+          invrej = 1.0 / re_meam[eltj][eltj];
           aj = rij * invrej - 1.0;
-          ro0j = this->rho0_meam[eltj];
-          rhoa0j = ro0j * MathSpecialKokkos::fm_exp(-this->beta0_meam[eltj] * aj);
-          drhoa0j = -this->beta0_meam[eltj] * invrej * rhoa0j;
-          rhoa1j = ro0j * MathSpecialKokkos::fm_exp(-this->beta1_meam[eltj] * aj);
-          drhoa1j = -this->beta1_meam[eltj] * invrej * rhoa1j;
-          rhoa2j = ro0j * MathSpecialKokkos::fm_exp(-this->beta2_meam[eltj] * aj);
-          drhoa2j = -this->beta2_meam[eltj] * invrej * rhoa2j;
-          rhoa3j = ro0j * MathSpecialKokkos::fm_exp(-this->beta3_meam[eltj] * aj);
-          drhoa3j = -this->beta3_meam[eltj] * invrej * rhoa3j;
+          ro0j = rho0_meam[eltj];
+          rhoa0j = ro0j * MathSpecialKokkos::fm_exp(-beta0_meam[eltj] * aj);
+          drhoa0j = -beta0_meam[eltj] * invrej * rhoa0j;
+          rhoa1j = ro0j * MathSpecialKokkos::fm_exp(-beta1_meam[eltj] * aj);
+          drhoa1j = -beta1_meam[eltj] * invrej * rhoa1j;
+          rhoa2j = ro0j * MathSpecialKokkos::fm_exp(-beta2_meam[eltj] * aj);
+          drhoa2j = -beta2_meam[eltj] * invrej * rhoa2j;
+          rhoa3j = ro0j * MathSpecialKokkos::fm_exp(-beta3_meam[eltj] * aj);
+          drhoa3j = -beta3_meam[eltj] * invrej * rhoa3j;
         } else {
           rhoa0j = rhoa0i;
           drhoa0j = drhoa0i;
@@ -178,14 +201,14 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
           drhoa3j = drhoa3i;
         }
 
-        const double t1mi = this->t1_meam[elti];
-        const double t2mi = this->t2_meam[elti];
-        const double t3mi = this->t3_meam[elti];
-        const double t1mj = this->t1_meam[eltj];
-        const double t2mj = this->t2_meam[eltj];
-        const double t3mj = this->t3_meam[eltj];
+        const double t1mi = t1_meam[elti];
+        const double t2mi = t2_meam[elti];
+        const double t3mi = t3_meam[elti];
+        const double t1mj = t1_meam[eltj];
+        const double t2mj = t2_meam[eltj];
+        const double t3mj = t3_meam[eltj];
 
-        if (this->ialloy == 1) {
+        if (ialloy == 1) {
           rhoa1j  *= t1mj;
           rhoa2j  *= t2mj;
           rhoa3j  *= t3mj;
@@ -213,12 +236,12 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
         for (n = 0; n < 3; n++) {
           for (p = n; p < 3; p++) {
             for (q = p; q < 3; q++) {
-              arg = delij[n] * delij[p] * delij[q] * this->v3D[nv3];
+              arg = delij[n] * delij[p] * delij[q] * v3D[nv3];
               arg1i3 = arg1i3 + d_arho3(i,nv3) * arg;
               arg1j3 = arg1j3 - d_arho3(j,nv3) * arg;
               nv3 = nv3 + 1;
             }
-            arg = delij[n] * delij[p] * this->v2D[nv2];
+            arg = delij[n] * delij[p] * v2D[nv2];
             arg1i2 = arg1i2 + d_arho2(i,nv2) * arg;
             arg1j2 = arg1j2 + d_arho2(j,nv2) * arg;
             nv2 = nv2 + 1;
@@ -229,11 +252,11 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
           arg3j3 = arg3j3 - d_arho3b(j,n) * delij[n];
         }
 
-        //     rho0 terms
+        // rho0 terms
         drho0dr1 = drhoa0j * sij;
         drho0dr2 = drhoa0i * sij;
 
-        //     rho1 terms
+        // rho1 terms
         a1 = 2 * sij / rij;
         drho1dr1 = a1 * (drhoa1j - rhoa1j / rij) * arg1i1;
         drho1dr2 = a1 * (drhoa1i - rhoa1i / rij) * arg1j1;
@@ -243,7 +266,7 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
           drho1drm2[m] = -a1 * rhoa1i * d_arho1(j,m);
         }
 
-        //     rho2 terms
+        // rho2 terms
         a2 = 2 * sij / rij2;
         drho2dr1 = a2 * (drhoa2j - 2 * rhoa2j / rij) * arg1i2 - 2.0 / 3.0 * d_arho2b[i] * drhoa2j * sij;
         drho2dr2 = a2 * (drhoa2i - 2 * rhoa2i / rij) * arg1j2 - 2.0 / 3.0 * d_arho2b[j] * drhoa2i * sij;
@@ -252,14 +275,14 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
           drho2drm1[m] = 0.0;
           drho2drm2[m] = 0.0;
           for (n = 0; n < 3; n++) {
-            drho2drm1[m] = drho2drm1[m] + d_arho2(i,this->vind2D[m][n]) * delij[n];
-            drho2drm2[m] = drho2drm2[m] - d_arho2(j,this->vind2D[m][n]) * delij[n];
+            drho2drm1[m] = drho2drm1[m] + d_arho2(i,vind2D[m][n]) * delij[n];
+            drho2drm2[m] = drho2drm2[m] - d_arho2(j,vind2D[m][n]) * delij[n];
           }
           drho2drm1[m] = a2 * rhoa2j * drho2drm1[m];
           drho2drm2[m] = -a2 * rhoa2i * drho2drm2[m];
         }
 
-        //     rho3 terms
+        // rho3 terms
         rij3 = rij * rij2;
         a3 = 2 * sij / rij3;
         a3a = 6.0 / 5.0 * sij / rij;
@@ -273,9 +296,9 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
           nv2 = 0;
           for (n = 0; n < 3; n++) {
             for (p = n; p < 3; p++) {
-              arg = delij[n] * delij[p] * this->v2D[nv2];
-              drho3drm1[m] = drho3drm1[m] + d_arho3(i,this->vind3D[m][n][p]) * arg;
-              drho3drm2[m] = drho3drm2[m] + d_arho3(j,this->vind3D[m][n][p]) * arg;
+              arg = delij[n] * delij[p] * v2D[nv2];
+              drho3drm1[m] = drho3drm1[m] + d_arho3(i,vind3D[m][n][p]) * arg;
+              drho3drm2[m] = drho3drm2[m] + d_arho3(j,vind3D[m][n][p]) * arg;
               nv2 = nv2 + 1;
             }
           }
@@ -283,7 +306,7 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
           drho3drm2[m] = (-a3 * drho3drm2[m] + a3a * d_arho3b(j,m)) * rhoa3i;
         }
 
-        //     Compute derivatives of weighting functions t wrt rij
+        // Compute derivatives of weighting functions t wrt rij
         t1i = d_t_ave(i,0);
         t2i = d_t_ave(i,1);
         t3i = d_t_ave(i,2);
@@ -291,7 +314,7 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
         t2j = d_t_ave(j,1);
         t3j = d_t_ave(j,2);
 
-        if (this->ialloy == 1) {
+        if (ialloy == 1) {
 
           a1i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i,0));
           a1j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j,0));
@@ -307,7 +330,7 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
           dt3dr1 = a3i * (t3mj - t3i * MathSpecialKokkos::square(t3mj));
           dt3dr2 = a3j * (t3mi - t3j * MathSpecialKokkos::square(t3mi));
 
-        } else if (this->ialloy == 2) {
+        } else if (ialloy == 2) {
 
           dt1dr1 = 0.0;
           dt1dr2 = 0.0;
@@ -333,25 +356,27 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
           dt3dr2 = aj * (t3mi - t3j);
         }
 
-        //     Compute derivatives of total density wrt rij, sij and rij(3)
-         get_shpfcn(this->lattce_meam[elti][elti], shpi);
-         get_shpfcn(this->lattce_meam[eltj][eltj], shpj);
-        drhodr1 = dgamma1[i] * drho0dr1 +
-          dgamma2[i] * (dt1dr1 * d_rho1[i] + t1i * drho1dr1 + dt2dr1 * d_rho2[i] + t2i * drho2dr1 +
+        // Compute derivatives of total density wrt rij, sij and rij(3)
+        get_shpfcn(lattce_meam[elti][elti], stheta_meam[elti][elti], ctheta_meam[elti][elti], shpi);
+        get_shpfcn(lattce_meam[eltj][eltj], stheta_meam[elti][elti], ctheta_meam[elti][elti], shpj);
+
+        drhodr1 = d_dgamma1[i] * drho0dr1 +
+          d_dgamma2[i] * (dt1dr1 * d_rho1[i] + t1i * drho1dr1 + dt2dr1 * d_rho2[i] + t2i * drho2dr1 +
                         dt3dr1 * d_rho3[i] + t3i * drho3dr1) -
-          dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1);
-        drhodr2 = dgamma1[j] * drho0dr2 +
-          dgamma2[j] * (dt1dr2 * d_rho1[j] + t1j * drho1dr2 + dt2dr2 * d_rho2[j] + t2j * drho2dr2 +
+          d_dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1);
+        drhodr2 = d_dgamma1[j] * drho0dr2 +
+          d_dgamma2[j] * (dt1dr2 * d_rho1[j] + t1j * drho1dr2 + dt2dr2 * d_rho2[j] + t2j * drho2dr2 +
                         dt3dr2 * d_rho3[j] + t3j * drho3dr2) -
-          dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2);
+          d_dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2);
         for (m = 0; m < 3; m++) {
           drhodrm1[m] = 0.0;
           drhodrm2[m] = 0.0;
-          drhodrm1[m] = dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]);
-          drhodrm2[m] = dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]);
+          drhodrm1[m] = d_dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]);
+          drhodrm2[m] = d_dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]);
+
         }
 
-        //     Compute derivatives wrt sij, but only if necessary
+        // Compute derivatives wrt sij, but only if necessary
         if (!iszero_kk(d_dscrfcn[fnoffset + jn])) {
           drho0ds1 = rhoa0j;
           drho0ds2 = rhoa0i;
@@ -366,7 +391,7 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
           drho3ds1 = a3 * rhoa3j * arg1i3 - a3a * rhoa3j * arg3i3;
           drho3ds2 = a3 * rhoa3i * arg1j3 - a3a * rhoa3i * arg3j3;
 
-          if (this->ialloy == 1) {
+          if (ialloy == 1) {
             a1i = fdiv_zero_kk(rhoa0j, d_tsq_ave(i,0));
             a1j = fdiv_zero_kk(rhoa0i, d_tsq_ave(j,0));
             a2i = fdiv_zero_kk(rhoa0j, d_tsq_ave(i,1));
@@ -381,7 +406,7 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
             dt3ds1 = a3i * (t3mj - t3i * MathSpecialKokkos::square(t3mj));
             dt3ds2 = a3j * (t3mi - t3j * MathSpecialKokkos::square(t3mi));
 
-          } else if (this->ialloy == 2) {
+          } else if (ialloy == 2) {
 
             dt1ds1 = 0.0;
             dt1ds2 = 0.0;
@@ -417,7 +442,7 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
             d_dgamma3[j] * (shpj[0] * dt1ds2 + shpj[1] * dt2ds2 + shpj[2] * dt3ds2);
         }
 
-        //     Compute derivatives of energy wrt rij, sij and rij[3]
+        // Compute derivatives of energy wrt rij, sij and rij[3]
         dUdrij = phip * sij + d_frhop[i] * drhodr1 + d_frhop[j] * drhodr2;
         dUdsij = 0.0;
         if (!iszero_kk(d_dscrfcn[fnoffset + jn])) {
@@ -427,17 +452,17 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
           dUdrijm[m] = d_frhop[i] * drhodrm1[m] + d_frhop[j] * drhodrm2[m];
         }
 
-        //     Add the part of the force due to dUdrij and dUdsij
+        // Add the part of the force due to dUdrij and dUdsij
         force = dUdrij * recip + dUdsij * d_dscrfcn[fnoffset + jn];
         for (m = 0; m < 3; m++) {
           forcem = delij[m] * force + dUdrijm[m];
           a_f(i,m) += forcem;
-          a_f(j,m) -= -forcem;
+          a_f(j,m) -= forcem;
         }
 
-        //     Tabulate per-atom virial as symmetrized stress tensor
+        // Tabulate per-atom virial as symmetrized stress tensor
 
-        if (vflag_atom != 0) {
+        if (vflag_either) {
           fi[0] = delij[0] * force + dUdrijm[0];
           fi[1] = delij[1] * force + dUdrijm[1];
           fi[2] = delij[2] * force + dUdrijm[2];
@@ -448,36 +473,41 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
           v[4] = -0.25 * (delij[0] * fi[2] + delij[2] * fi[0]);
           v[5] = -0.25 * (delij[1] * fi[2] + delij[2] * fi[1]);
 
-          for (m = 0; m < 6; m++) {
-            a_vatom(i,m) += v[m];
-            a_vatom(j,m) += v[m];
+          if (vflag_global)
+            for (m = 0; m < 6; m++)
+              ev.v[m] += 2.0*v[m];
+
+          if (vflag_atom) {
+            for (m = 0; m < 6; m++) {
+              a_vatom(i,m) += v[m];
+              a_vatom(j,m) += v[m];
+            }
           }
         }
 
-        //     Now compute forces on other atoms k due to change in sij
+        // Now compute forces on other atoms k due to change in sij
 
-        if (iszero_kk(sij) || iszero_kk(sij - 1.0)) continue; //: cont jn loop
+        if (iszero_kk(sij) || isone_kk(sij)) continue; //: cont jn loop
 
         double dxik(0), dyik(0), dzik(0);
         double dxjk(0), dyjk(0), dzjk(0);
 
         for (kn = 0; kn < d_numneigh_full[i]; kn++) {
-          //k = firstneigh_full[kn];
           k = d_neighbors_full(i,kn);
-          eltk = fmap[type[k]];
+          eltk = d_map[type[k]];
           if (k != j && eltk >= 0) {
             double xik, xjk, cikj, sikj, dfc, a;
             double dCikj1, dCikj2;
             double delc, rik2, rjk2;
 
             sij = d_scrfcn[jn+fnoffset] * d_fcpair[jn+fnoffset];
-            const double Cmax = this->Cmax_meam[elti][eltj][eltk];
-            const double Cmin = this->Cmin_meam[elti][eltj][eltk];
+            const double Cmax = Cmax_meam[elti][eltj][eltk];
+            const double Cmin = Cmin_meam[elti][eltj][eltk];
 
             dsij1 = 0.0;
             dsij2 = 0.0;
-            if (!iszero_kk(sij) && !iszero_kk(sij - 1.0)) {
-              const double rbound = rij2 * this->ebound_meam[elti][eltj];
+            if (!iszero_kk(sij) && !isone_kk(sij)) {
+              const double rbound = rij2 * ebound_meam[elti][eltj];
               delc = Cmax - Cmin;
               dxjk = x(k,0) - x(j,0);
               dyjk = x(k,1) - x(j,1);
@@ -521,9 +551,9 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
               a_f(k,1) -= force1 * dyik + force2 * dyjk;
               a_f(k,2) -= force1 * dzik + force2 * dzjk;
 
-              //     Tabulate per-atom virial as symmetrized stress tensor
+              // Tabulate per-atom virial as symmetrized stress tensor
 
-              if (vflag_atom != 0) {
+              if (vflag_either) {
                 fi[0] = force1 * dxik;
                 fi[1] = force1 * dyik;
                 fi[2] = force1 * dzik;
@@ -537,18 +567,24 @@ MEAMKokkos<DeviceType>::operator()(TagMEAMforce<NEIGHFLAG>, const int &ii, EV_FL
                 v[4] = -sixth * (dxik * fi[2] + dxjk * fj[2] + dzik * fi[0] + dzjk * fj[0]);
                 v[5] = -sixth * (dyik * fi[2] + dyjk * fj[2] + dzik * fi[1] + dzjk * fj[1]);
 
-                for (m = 0; m < 6; m++) {
-                  a_vatom(i,m) += v[m];
-                  a_vatom(j,m) += v[m];
-                  a_vatom(k,m) += v[m];
+                if (vflag_global)
+                  for (m = 0; m < 6; m++)
+                    ev.v[m] += 3.0*v[m];
+
+                if (vflag_atom) {
+                  for (m = 0; m < 6; m++) {
+                    a_vatom(i,m) += v[m];
+                    a_vatom(j,m) += v[m];
+                    a_vatom(k,m) += v[m];
+                  }
                 }
               }
             }
           }
-          //     end of k loop
+          // end of k loop
         }
       }
     }
-    //     end of j loop
+    // end of j loop
   }
 }
diff --git a/src/KOKKOS/meam_funcs_kokkos.h b/src/KOKKOS/meam_funcs_kokkos.h
index 9e41be9ea8..f02def0676 100644
--- a/src/KOKKOS/meam_funcs_kokkos.h
+++ b/src/KOKKOS/meam_funcs_kokkos.h
@@ -12,7 +12,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Sebastian Hütter (OvGU)
+   Contributing author: Naga Vydyanathan (NVIDIA)
 ------------------------------------------------------------------------- */
 
 #include "math_special_kokkos.h"
@@ -22,12 +22,12 @@ using namespace MathSpecialKokkos;
 
 //-----------------------------------------------------------------------------
 // Compute G(gamma) based on selection flag ibar:
-//   0 => G = sqrt(1+gamma)
-//   1 => G = exp(gamma/2)
-//   2 => not implemented
-//   3 => G = 2/(1+exp(-gamma))
-//   4 => G = sqrt(1+gamma)
-//  -5 => G = +-sqrt(abs(1+gamma))
+//  0 => G = sqrt(1+gamma)
+//  1 => G = exp(gamma/2)
+//  2 => not implemented
+//  3 => G = 2/(1+exp(-gamma))
+//  4 => G = sqrt(1+gamma)
+// -5 => G = +-sqrt(abs(1+gamma))
 //
 template<class DeviceType> 
 KOKKOS_INLINE_FUNCTION
@@ -40,9 +40,9 @@ double MEAMKokkos<DeviceType>::G_gam(const double gamma, const int ibar, int &er
     case 4:
       gsmooth_switchpoint = -gsmooth_factor / (gsmooth_factor + 1);
       if (gamma < gsmooth_switchpoint) {
-        //         e.g. gsmooth_factor is 99, {:
-        //         gsmooth_switchpoint = -0.99
-        //         G = 0.01*(-0.99/gamma)**99
+        // e.g. gsmooth_factor is 99, {:
+        // gsmooth_switchpoint = -0.99
+        // G = 0.01*(-0.99/gamma)**99
         double G = 1 / (gsmooth_factor + 1) * pow((gsmooth_switchpoint / gamma), gsmooth_factor);
         return sqrt(G);
       } else {
@@ -51,7 +51,7 @@ double MEAMKokkos<DeviceType>::G_gam(const double gamma, const int ibar, int &er
     case 1:
       return MathSpecialKokkos::fm_exp(gamma / 2.0);
     case 3:
-      return 2.0 / (1.0 + exp(-gamma));
+      return 2.0 / (1.0 + MathSpecialKokkos::fm_exp(-gamma));
     case -5:
       if ((1.0 + gamma) >= 0) {
         return sqrt(1.0 + gamma);
@@ -65,12 +65,12 @@ double MEAMKokkos<DeviceType>::G_gam(const double gamma, const int ibar, int &er
 
 //-----------------------------------------------------------------------------
 // Compute G(gamma and dG(gamma) based on selection flag ibar:
-//   0 => G = sqrt(1+gamma)
-//   1 => G = exp(gamma/2)
-//   2 => not implemented
-//   3 => G = 2/(1+exp(-gamma))
-//   4 => G = sqrt(1+gamma)
-//  -5 => G = +-sqrt(abs(1+gamma))
+//  0 => G = sqrt(1+gamma)
+//  1 => G = exp(gamma/2)
+//  2 => not implemented
+//  3 => G = 2/(1+exp(-gamma))
+//  4 => G = sqrt(1+gamma)
+// -5 => G = +-sqrt(abs(1+gamma))
 //
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
@@ -84,9 +84,9 @@ double MEAMKokkos<DeviceType>::dG_gam(const double gamma, const int ibar, double
     case 4:
       gsmooth_switchpoint = -gsmooth_factor / (gsmooth_factor + 1);
       if (gamma < gsmooth_switchpoint) {
-        //         e.g. gsmooth_factor is 99, {:
-        //         gsmooth_switchpoint = -0.99
-        //         G = 0.01*(-0.99/gamma)**99
+        // e.g. gsmooth_factor is 99, {:
+        // gsmooth_switchpoint = -0.99
+        // G = 0.01*(-0.99/gamma)**99
         G = 1 / (gsmooth_factor + 1) * pow((gsmooth_switchpoint / gamma), gsmooth_factor);
         G = sqrt(G);
         dG = -gsmooth_factor * G / (2.0 * gamma);
@@ -144,6 +144,30 @@ double MEAMKokkos<DeviceType>::zbl(const double r, const int z1, const int z2) c
   return result;
 }
 
+//-----------------------------------------------------------------------------
+// Compute embedding function F(rhobar) and derivative F'(rhobar), eqn I.5
+//
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double MEAMKokkos<DeviceType>::embedding(const double A, const double Ec, const double rhobar, double& dF) const
+{
+  const double AEc = A * Ec;
+
+  if (rhobar > 0.0) {
+      const double lrb = log(rhobar);
+      dF = AEc * (1.0 + lrb);
+      return AEc * rhobar * lrb;
+  } else {
+    if (emb_lin_neg == 0) {
+      dF = 0.0;
+      return 0.0;
+    } else {
+      dF = - AEc;
+      return - AEc * rhobar;
+    }
+  }
+}
+
 //-----------------------------------------------------------------------------
 // Compute Rose energy function, I.16
 //
@@ -178,7 +202,7 @@ double MEAMKokkos<DeviceType>::erose(const double r, const double re, const doub
 //
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void MEAMKokkos<DeviceType>::get_shpfcn(const lattice_t latt, double (&s)[3]) const
+void MEAMKokkos<DeviceType>::get_shpfcn(const lattice_t latt, const double sthe, const double cthe, double (&s)[3]) const
 {
   switch (latt) {
     case FCC:
@@ -194,7 +218,9 @@ void MEAMKokkos<DeviceType>::get_shpfcn(const lattice_t latt, double (&s)[3]) co
       s[1] = 0.0;
       s[2] = 1.0 / 3.0;
       break;
+    case CH4: // CH4 actually needs shape factor for diamond for C, dimer for H
     case DIA:
+    case DIA3:
       s[0] = 0.0;
       s[1] = 0.0;
       s[2] = 32.0 / 9.0;
@@ -202,12 +228,24 @@ void MEAMKokkos<DeviceType>::get_shpfcn(const lattice_t latt, double (&s)[3]) co
     case DIM:
       s[0] = 1.0;
       s[1] = 2.0 / 3.0;
-      //        s(3) = 1.d0
-      s[2] = 0.40;
+      // s(4) = 1.d0 // this should be 0.4 unless (1-legendre) is multiplied in the density calc.
+      s[2] = 0.40; // this is (1-legendre) where legendre = 0.6 in dynamo is accounted.
+      break;
+    case LIN: // linear, theta being 180
+      s[0] = 0.0;
+      s[1] = 8.0 / 3.0; // 4*(co**4 + si**4 - 1.0/3.0) in zig become 4*(1-1/3)
+      s[2] = 0.0;
+      break;
+    case ZIG: //zig-zag
+    case TRI: //trimer e.g. H2O
+      s[0] = 4.0*pow(cthe,2);
+      s[1] = 4.0*(pow(cthe,4) + pow(sthe,4) - 1.0/3.0);
+      s[2] = 4.0*(pow(cthe,2) * (3*pow(sthe,4) + pow(cthe,4)));
+      s[2] = s[2] - 0.6*s[0]; //legend in dyn, 0.6 is default value.
       break;
     default:
       s[0] = 0.0;
-      //        call error('Lattice not defined in get_shpfcn.')
+      // call error('Lattice not defined in get_shpfcn.')
   }
 }
 
@@ -225,102 +263,26 @@ int MEAMKokkos<DeviceType>::get_Zij(const lattice_t latt) const
       return 8;
     case HCP:
       return 12;
-    case B1:
-      return 6;
     case DIA:
+    case DIA3:
       return 4;
     case DIM:
       return 1;
+    case B1:
+      return 6;
     case C11:
       return 10;
     case L12:
       return 12;
     case B2:
       return 8;
-      //        call error('Lattice not defined in get_Zij.')
+    case CH4: // DYNAMO currently implemented this way while it needs two Z values, 4 and 1
+      return 4;
+    case LIN:
+    case ZIG:
+    case TRI:
+      return 2;
+      // call error('Lattice not defined in get_Zij.')
   }
   return 0;
 }
-
-//-----------------------------------------------------------------------------
-// Number of second neighbors for the reference structure
-//   a = distance ratio R1/R2
-//   S = second neighbor screening function
-//
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-int MEAMKokkos<DeviceType>::get_Zij2(const lattice_t latt, const double cmin, const double cmax, double& a, double& S) const
-{
-
-  double C, x, sijk;
-  int Zij2 = 0, numscr = 0;
-
-  switch (latt) {
-
-  case FCC:
-    Zij2 = 6;
-    a = sqrt(2.0);
-    numscr = 4;
-    break;
-
-  case BCC:
-    Zij2 = 6;
-    a = 2.0 / sqrt(3.0);
-    numscr = 4;
-    break;
-
-  case HCP:
-    Zij2 = 6;
-    a = sqrt(2.0);
-    numscr = 4;
-    break;
-
-  case B1:
-    Zij2 = 12;
-    a = sqrt(2.0);
-    numscr = 2;
-    break;
-
-  case DIA:
-    Zij2 = 12;
-    a = sqrt(8.0 / 3.0);
-    numscr = 1;
-    if (cmin < 0.500001) {
-        //          call error('can not do 2NN MEAM for dia')
-    }
-    break;
-
-  case DIM:
-    //        this really shouldn't be allowed; make sure screening is zero
-    a = 1.0;
-    S = 0.0;
-    return 0;
-
-  case L12:
-    Zij2 = 6;
-    a = sqrt(2.0);
-    numscr = 4;
-    break;
-
-  case B2:
-    Zij2 = 6;
-    a = 2.0 / sqrt(3.0);
-    numscr = 4;
-    break;
-  case C11:
-    // unsupported lattice flag C11 in get_Zij
-    break;
-  default:
-    // unknown lattic flag in get Zij
-    //        call error('Lattice not defined in get_Zij.')
-    break;
-  }
-
-  // Compute screening for each first neighbor
-  C = 4.0 / (a * a) - 1.0;
-  x = (C - cmin) / (cmax - cmin);
-  sijk = fcut(x);
-  // There are numscr first neighbors screening the second neighbors
-  S = MathSpecialKokkos::powint(sijk, numscr);
-  return Zij2;
-}
diff --git a/src/KOKKOS/meam_impl_kokkos.h b/src/KOKKOS/meam_impl_kokkos.h
index 0d2ec99445..0a787ba2eb 100644
--- a/src/KOKKOS/meam_impl_kokkos.h
+++ b/src/KOKKOS/meam_impl_kokkos.h
@@ -12,11 +12,12 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Sebastian Hütter (OvGU)
+   Contributing author: Naga Vydyanathan (NVIDIA), Stan Moore (SNL)
 ------------------------------------------------------------------------- */
 
 #include "memory_kokkos.h"
 #include "meam_kokkos.h"
+
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
@@ -24,21 +25,15 @@ using namespace LAMMPS_NS;
 template<class DeviceType>
 MEAMKokkos<DeviceType>::MEAMKokkos(Memory *mem) : MEAM(mem)
 {
+  d_errorflag = typename AT::t_int_scalar("meam:errorflag");
 }
 
 template<class DeviceType>
 MEAMKokkos<DeviceType>::~MEAMKokkos()
 {
+  if (copymode) return;
+
   MemoryKokkos *memoryKK = (MemoryKokkos *)memory;
-  
-  memoryKK->destroy_kokkos(k_phirar6,phirar6);
-  memoryKK->destroy_kokkos(k_phirar5,phirar5);
-  memoryKK->destroy_kokkos(k_phirar4,phirar4);
-  memoryKK->destroy_kokkos(k_phirar3,phirar3);
-  memoryKK->destroy_kokkos(k_phirar2,phirar2);
-  memoryKK->destroy_kokkos(k_phirar1,phirar1);
-  memoryKK->destroy_kokkos(k_phirar,phirar);
-  memoryKK->destroy_kokkos(k_phir,phir);
 
   memoryKK->destroy_kokkos(k_rho,rho);
   memoryKK->destroy_kokkos(k_rho0,rho0);
@@ -63,10 +58,10 @@ MEAMKokkos<DeviceType>::~MEAMKokkos()
   memoryKK->destroy_kokkos(k_dscrfcn,dscrfcn);
   memoryKK->destroy_kokkos(k_fcpair,fcpair);
 }
+
 #include "meam_setup_done_kokkos.h"
 #include "meam_funcs_kokkos.h"
 #include "meam_dens_init_kokkos.h"
 #include "meam_dens_final_kokkos.h"
 #include "meam_force_kokkos.h"
-//
 
diff --git a/src/KOKKOS/meam_kokkos.h b/src/KOKKOS/meam_kokkos.h
index 0231f4b767..fb34f45b5c 100644
--- a/src/KOKKOS/meam_kokkos.h
+++ b/src/KOKKOS/meam_kokkos.h
@@ -14,68 +14,77 @@ namespace LAMMPS_NS {
 struct TagMEAMDensFinal{};
 template<int NEIGHFLAG>
 struct TagMEAMDensInit{};
-struct TagMEAMInitialize{};
+struct TagMEAMZero{};
 template<int NEIGHFLAG>
-struct TagMEAMforce{};
+struct TagMEAMForce{};
 
 template<class DeviceType>
 class MEAMKokkos : public MEAM
 {
-public:
+ public:
   typedef ArrayTypes<DeviceType> AT;
   typedef EV_FLOAT value_type;
   MEAMKokkos(Memory* mem);
-  ~MEAMKokkos();
+  ~MEAMKokkos() override;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(TagMEAMDensFinal, const int&, EV_FLOAT&) const;
 
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagMEAMDensInit<NEIGHFLAG>, const int&, EV_FLOAT&) const;
+  void operator()(TagMEAMDensInit<NEIGHFLAG>, const int&) const;
   
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagMEAMInitialize, const int&) const;
+  void operator()(TagMEAMZero, const int&) const;
 
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagMEAMforce<NEIGHFLAG>, const int&, EV_FLOAT&) const;
-protected:
-//Parameters to meam_dens_init - is there a better way to do this?
-  int ntype;
-  typename AT::t_int_1d_randomread type;
-  typename AT::t_int_1d_randomread d_offset;
-  typename AT::t_int_1d_randomread fmap;
-  typename AT::t_x_array_randomread x;
-  typename AT::t_int_1d_randomread d_numneigh_half;
-  typename AT::t_int_1d_randomread d_numneigh_full;
+  void operator()(TagMEAMForce<NEIGHFLAG>, const int&, EV_FLOAT&) const;
+ private:
+
+  // parameters to meam_dens_init
+
+  int ntype, nlocal;
+  typename AT::t_int_1d type;
+  typename AT::t_int_1d d_offset;
+  typename AT::t_int_1d d_map;
+  typename AT::t_int_2d d_scale;
+  typename AT::t_x_array x;
+  typename AT::t_int_1d d_numneigh_half;
+  typename AT::t_int_1d d_numneigh_full;
   typename AT::t_neighbors_2d d_neighbors_half;
   typename AT::t_neighbors_2d d_neighbors_full;
-  typename AT::t_int_1d_randomread d_ilist_half;
+  typename AT::t_int_1d d_ilist_half;
   typename AT::t_f_array f;
   typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
-//Parameters to meam_dens_final - is there a better way to do this?
-  int eflag_either, eflag_global, eflag_atom, vflag_atom;
-  double *eng_vdwl;
+
+  // parameters to meam_dens_final
+
+  typename AT::t_int_scalar d_errorflag;
+  int eflag_either, eflag_global, eflag_atom, vflag_either, vflag_global, vflag_atom;
   typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
 
-public:
-  void meam_dens_setup(int, int, int);
-  void meam_setup_done(void);
-  void meam_dens_init(int , int , typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread, typename AT::t_x_array_randomread, typename AT::t_int_1d_randomread, 
-                      typename AT::t_int_1d_randomread , int* , typename AT::t_int_1d_randomread, typename AT::t_neighbors_2d,typename AT::t_neighbors_2d,typename AT::t_int_1d_randomread, int );
-  void meam_dens_final(int , int , int , int , double* ,
-                       typename ArrayTypes<DeviceType>::t_efloat_1d , int , typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread , int& );
-  void meam_force(int , int , int , int , int , double* ,
-                  typename ArrayTypes<DeviceType>::t_efloat_1d , int , typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread , typename AT::t_x_array_randomread , typename AT::t_int_1d_randomread , 
-                  typename AT::t_int_1d_randomread , typename AT::t_f_array , typename ArrayTypes<DeviceType>::t_virial_array ,typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread, typename AT::t_neighbors_2d, typename AT::t_neighbors_2d, int);
+ public:
+  void meam_dens_setup(int, int, int) override;
+  void meam_setup_done(double*) override;
+  void meam_dens_init(int, int, typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_x_array,
+                      typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_int_1d,
+                      typename AT::t_neighbors_2d, typename AT::t_neighbors_2d, typename AT::t_int_1d,
+                      int, int);
+  void meam_dens_final(int, int, int, int, typename ArrayTypes<DeviceType>::t_efloat_1d,
+                       int, typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_int_2d, int&, EV_FLOAT&);
+  void meam_force(int, int, int, int, int, typename ArrayTypes<DeviceType>::t_efloat_1d, 
+                  int, typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_x_array, typename AT::t_int_1d, 
+                  typename AT::t_int_1d, typename AT::t_f_array, typename ArrayTypes<DeviceType>::t_virial_array,
+                  typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_neighbors_2d, typename AT::t_neighbors_2d,
+                  int, int, EV_FLOAT&);
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
-  void getscreen(int , int, typename AT::t_x_array_randomread , typename AT::t_int_1d_randomread,
-                 typename AT::t_int_1d_randomread, int , typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread ) const;
+  void getscreen(int, int, typename AT::t_x_array, typename AT::t_int_1d,
+                 typename AT::t_int_1d, int, typename AT::t_int_1d, typename AT::t_int_1d) const;
   template<int NEIGHFLAG>
   KOKKOS_INLINE_FUNCTION
-  void calc_rho1(int , int , typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread , typename AT::t_x_array_randomread , typename AT::t_int_1d_randomread, int ) const;
+  void calc_rho1(int, int, typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_x_array, typename AT::t_int_1d, int) const;
   KOKKOS_INLINE_FUNCTION
   double fcut(const double xi) const;
   KOKKOS_INLINE_FUNCTION
@@ -89,18 +98,19 @@ public:
   KOKKOS_INLINE_FUNCTION
   double dG_gam(const double, const int, double&) const;
   KOKKOS_INLINE_FUNCTION
-  double zbl(const double, const int, const int) const; 
+  double zbl(const double, const int, const int) const;
+  KOKKOS_INLINE_FUNCTION
+  double embedding(const double, const double, const double, double&) const; 
   KOKKOS_INLINE_FUNCTION
   double erose(const double, const double, const double, const double, const double, const double, const int) const;
   KOKKOS_INLINE_FUNCTION
-  void get_shpfcn(const lattice_t latt, double (&s)[3]) const;
+  void get_shpfcn(const lattice_t latt, const double sthe, const double cthe,
+                  double (&s)[3]) const;
   KOKKOS_INLINE_FUNCTION
-  int get_Zij(const lattice_t ) const;
-  KOKKOS_INLINE_FUNCTION
-  int get_Zij2(const lattice_t, const double, const double, double&, double&) const; 
-public:
+  int get_Zij(const lattice_t) const;
+ public:
   DAT::tdual_ffloat_1d k_rho, k_rho0, k_rho1, k_rho2, k_rho3, k_frhop;
-  typename ArrayTypes<DeviceType>::t_ffloat_1d d_rho, d_rho0,d_rho1, d_rho2, d_rho3, d_frhop;
+  typename ArrayTypes<DeviceType>::t_ffloat_1d d_rho, d_rho0, d_rho1, d_rho2, d_rho3, d_frhop;
   HAT::t_ffloat_1d h_rho, h_rho0, h_rho1, h_rho2, h_rho3, h_frhop;
   DAT::tdual_ffloat_1d k_gamma, k_dgamma1, k_dgamma2, k_dgamma3, k_arho2b;
   typename ArrayTypes<DeviceType>::t_ffloat_1d d_gamma, d_dgamma1, d_dgamma2, d_dgamma3, d_arho2b;
@@ -108,20 +118,54 @@ public:
   DAT::tdual_ffloat_2d k_arho1, k_arho2, k_arho3, k_arho3b, k_t_ave, k_tsq_ave;
   typename ArrayTypes<DeviceType>::t_ffloat_2d d_arho1, d_arho2, d_arho3, d_arho3b, d_t_ave, d_tsq_ave;
   HAT::t_ffloat_2d h_arho1, h_arho2, h_arho3, h_arho3b, h_t_ave, h_tsq_ave;
-  DAT::tdual_ffloat_2d k_phir, k_phirar, k_phirar1, k_phirar2, k_phirar3, k_phirar4, k_phirar5, k_phirar6;
   typename ArrayTypes<DeviceType>::t_ffloat_2d d_phir, d_phirar, d_phirar1, d_phirar2, d_phirar3, d_phirar4, d_phirar5, d_phirar6;
-  HAT::t_ffloat_2d h_phir, h_phirar, h_phirar1, h_phirar2, h_phirar3, h_phirar4, h_phirar5, h_phirar6;
   DAT::tdual_ffloat_1d k_scrfcn, k_dscrfcn, k_fcpair;
   typename ArrayTypes<DeviceType>::t_ffloat_1d d_scrfcn, d_dscrfcn, d_fcpair;
   HAT::t_ffloat_1d h_scrfcn, h_dscrfcn, h_fcpair;
 
-  
+ protected:
+  int need_dup;
+  using KKDeviceType = typename KKDevice<DeviceType>::value;
+
+  template<typename DataType, typename Layout>
+  using DupScatterView = KKScatterView<DataType, Layout, KKDeviceType, KKScatterSum, KKScatterDuplicated>;
+
+  template<typename DataType, typename Layout>
+  using NonDupScatterView = KKScatterView<DataType, Layout, KKDeviceType, KKScatterSum, KKScatterNonDuplicated>;
+
+  DupScatterView<typename decltype(d_rho0)::data_type, typename decltype(d_rho0)::array_layout> dup_rho0;
+  NonDupScatterView<typename decltype(d_rho0)::data_type, typename decltype(d_rho0)::array_layout> ndup_rho0;
+  DupScatterView<typename decltype(d_arho2b)::data_type, typename decltype(d_arho2b)::array_layout> dup_arho2b;
+  NonDupScatterView<typename decltype(d_arho2b)::data_type, typename decltype(d_arho2b)::array_layout> ndup_arho2b;
+  DupScatterView<typename decltype(d_arho1)::data_type, typename decltype(d_arho1)::array_layout> dup_arho1;
+  NonDupScatterView<typename decltype(d_arho1)::data_type, typename decltype(d_arho1)::array_layout> ndup_arho1;
+  DupScatterView<typename decltype(d_arho2)::data_type, typename decltype(d_arho2)::array_layout> dup_arho2;
+  NonDupScatterView<typename decltype(d_arho2)::data_type, typename decltype(d_arho2)::array_layout> ndup_arho2;
+  DupScatterView<typename decltype(d_arho3)::data_type, typename decltype(d_arho3)::array_layout> dup_arho3;
+  NonDupScatterView<typename decltype(d_arho3)::data_type, typename decltype(d_arho3)::array_layout> ndup_arho3;
+  DupScatterView<typename decltype(d_arho3b)::data_type, typename decltype(d_arho3b)::array_layout> dup_arho3b;
+  NonDupScatterView<typename decltype(d_arho3b)::data_type, typename decltype(d_arho3b)::array_layout> ndup_arho3b;
+  DupScatterView<typename decltype(d_t_ave)::data_type, typename decltype(d_t_ave)::array_layout> dup_t_ave;
+  NonDupScatterView<typename decltype(d_t_ave)::data_type, typename decltype(d_t_ave)::array_layout> ndup_t_ave;
+  DupScatterView<typename decltype(d_tsq_ave)::data_type, typename decltype(d_tsq_ave)::array_layout> dup_tsq_ave;
+  NonDupScatterView<typename decltype(d_tsq_ave)::data_type, typename decltype(d_tsq_ave)::array_layout> ndup_tsq_ave;
+  DupScatterView<typename decltype(f)::data_type, typename decltype(f)::array_layout> dup_f;
+  NonDupScatterView<typename decltype(f)::data_type, typename decltype(f)::array_layout> ndup_f;
+  DupScatterView<typename decltype(d_eatom)::data_type, typename decltype(d_eatom)::array_layout> dup_eatom;
+  NonDupScatterView<typename decltype(d_eatom)::data_type, typename decltype(d_eatom)::array_layout> ndup_eatom;
+  DupScatterView<typename decltype(d_vatom)::data_type, typename decltype(d_vatom)::array_layout> dup_vatom;
+  NonDupScatterView<typename decltype(d_vatom)::data_type, typename decltype(d_vatom)::array_layout> ndup_vatom;
 };
+
 KOKKOS_INLINE_FUNCTION
 static bool iszero_kk(const double f) {
   return fabs(f) < 1e-20;
 }
 
+KOKKOS_INLINE_FUNCTION
+static bool isone_kk(const double f) {
+  return fabs(f-1.0) < 1e-20;
+}
 
 KOKKOS_INLINE_FUNCTION
 static double fdiv_zero_kk(const double n, const double d) {
@@ -134,9 +178,5 @@ static double fdiv_zero_kk(const double n, const double d) {
 
 }
 #include "meam_impl_kokkos.h"
-//#include "meam_setup_done_kokkos.h"
-//#include "meam_funcs_kokkos.h"
-//#include "meam_dens_init_kokkos.h"
-//#include "meam_dens_final_kokkos.h"
-//#include "meam_force_kokkos.h"
+
 #endif
diff --git a/src/KOKKOS/meam_setup_done_kokkos.h b/src/KOKKOS/meam_setup_done_kokkos.h
index 72852bf2cb..7d5de12427 100644
--- a/src/KOKKOS/meam_setup_done_kokkos.h
+++ b/src/KOKKOS/meam_setup_done_kokkos.h
@@ -1,71 +1,46 @@
 #include "meam_kokkos.h"
 
 template<class DeviceType>
-void MEAMKokkos<DeviceType>::meam_setup_done(void)
+void MEAMKokkos<DeviceType>::meam_setup_done(double* cutmax)
 {
-  MemoryKokkos *memoryKK = (MemoryKokkos *)memory;
+  MEAM::meam_setup_done(cutmax);
 
-  memoryKK->destroy_kokkos(k_phirar6,phirar6);
-  memoryKK->destroy_kokkos(k_phirar5,phirar5);
-  memoryKK->destroy_kokkos(k_phirar4,phirar4);
-  memoryKK->destroy_kokkos(k_phirar3,phirar3);
-  memoryKK->destroy_kokkos(k_phirar2,phirar2);
-  memoryKK->destroy_kokkos(k_phirar1,phirar1);
-  memoryKK->destroy_kokkos(k_phirar,phirar);
-  memoryKK->destroy_kokkos(k_phir,phir);
+  MemKK::realloc_kokkos(d_phir, "pair:phir", (neltypes * (neltypes + 1)) / 2, nr);
+  MemKK::realloc_kokkos(d_phirar, "pair:phirar", (neltypes * (neltypes + 1)) / 2, nr);
+  MemKK::realloc_kokkos(d_phirar1, "pair:phirar1", (neltypes * (neltypes + 1)) / 2, nr);
+  MemKK::realloc_kokkos(d_phirar2, "pair:phirar2", (neltypes * (neltypes + 1)) / 2, nr);
+  MemKK::realloc_kokkos(d_phirar3, "pair:phirar3", (neltypes * (neltypes + 1)) / 2, nr);
+  MemKK::realloc_kokkos(d_phirar4, "pair:phirar4", (neltypes * (neltypes + 1)) / 2, nr);
+  MemKK::realloc_kokkos(d_phirar5, "pair:phirar5", (neltypes * (neltypes + 1)) / 2, nr);
+  MemKK::realloc_kokkos(d_phirar6, "pair:phirar6", (neltypes * (neltypes + 1)) / 2, nr);
 
-  memoryKK->create_kokkos(k_phir, phir, (neltypes * (neltypes + 1)) / 2, nr, "pair:phir");
-  memoryKK->create_kokkos(k_phirar, phirar, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar");
-  memoryKK->create_kokkos(k_phirar1, phirar1, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar1");
-  memoryKK->create_kokkos(k_phirar2, phirar2, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar2");
-  memoryKK->create_kokkos(k_phirar3, phirar3, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar3");
-  memoryKK->create_kokkos(k_phirar4, phirar4, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar4");
-  memoryKK->create_kokkos(k_phirar5, phirar5, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar5");
-  memoryKK->create_kokkos(k_phirar6, phirar6, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar6");
-
-  h_phir = k_phir.h_view;
-  h_phirar = k_phirar.h_view;
-  h_phirar1 = k_phirar1.h_view;
-  h_phirar2 = k_phirar2.h_view;
-  h_phirar3 = k_phirar3.h_view;
-  h_phirar4 = k_phirar4.h_view;
-  h_phirar5 = k_phirar5.h_view;
-  h_phirar6 = k_phirar6.h_view;
+  auto h_phir = Kokkos::create_mirror_view(d_phir);
+  auto h_phirar = Kokkos::create_mirror_view(d_phirar);
+  auto h_phirar1 = Kokkos::create_mirror_view(d_phirar1);
+  auto h_phirar2 = Kokkos::create_mirror_view(d_phirar2);
+  auto h_phirar3 = Kokkos::create_mirror_view(d_phirar3);
+  auto h_phirar4 = Kokkos::create_mirror_view(d_phirar4);
+  auto h_phirar5 = Kokkos::create_mirror_view(d_phirar5);
+  auto h_phirar6 = Kokkos::create_mirror_view(d_phirar6);
 
   for (int i = 0; i <(neltypes * (neltypes + 1)) / 2; i++)
-    for(int j = 0; j < nr; j++)
-    {
-        h_phir(i,j) = phir[i][j];
-        h_phirar(i,j) = phirar[i][j];
-        h_phirar1(i,j) = phirar1[i][j];
-        h_phirar2(i,j) = phirar2[i][j];
-        h_phirar3(i,j) = phirar3[i][j];
-        h_phirar4(i,j) = phirar4[i][j];
-        h_phirar5(i,j) = phirar5[i][j];
-        h_phirar6(i,j) = phirar6[i][j];
+    for(int j = 0; j < nr; j++) {
+      h_phir(i,j) = phir[i][j];
+      h_phirar(i,j) = phirar[i][j];
+      h_phirar1(i,j) = phirar1[i][j];
+      h_phirar2(i,j) = phirar2[i][j];
+      h_phirar3(i,j) = phirar3[i][j];
+      h_phirar4(i,j) = phirar4[i][j];
+      h_phirar5(i,j) = phirar5[i][j];
+      h_phirar6(i,j) = phirar6[i][j];
     }
-  k_phir.template modify<LMPHostType>();
-  k_phir.template sync<DeviceType>();  
-  d_phir = k_phir.template view<DeviceType>();
-  k_phirar.template modify<LMPHostType>();
-  k_phirar.template sync<DeviceType>();  
-  d_phirar = k_phirar.template view<DeviceType>();
-  k_phirar1.template modify<LMPHostType>();
-  k_phirar1.template sync<DeviceType>();  
-  d_phirar1 = k_phirar1.template view<DeviceType>();
-  k_phirar2.template modify<LMPHostType>();
-  k_phirar2.template sync<DeviceType>();  
-  d_phirar2 = k_phirar2.template view<DeviceType>();
-  k_phirar3.template modify<LMPHostType>();
-  k_phirar3.template sync<DeviceType>();  
-  d_phirar3 = k_phirar3.template view<DeviceType>();
-  k_phirar4.template modify<LMPHostType>();
-  k_phirar4.template sync<DeviceType>();  
-  d_phirar4 = k_phirar4.template view<DeviceType>();
-  k_phirar5.template modify<LMPHostType>();
-  k_phirar5.template sync<DeviceType>();  
-  d_phirar5 = k_phirar5.template view<DeviceType>();
-  k_phirar6.template modify<LMPHostType>();
-  k_phirar6.template sync<DeviceType>();  
-  d_phirar6 = k_phirar6.template view<DeviceType>();
+
+  Kokkos::deep_copy(d_phir,h_phir);
+  Kokkos::deep_copy(d_phirar,h_phirar);
+  Kokkos::deep_copy(d_phirar1,h_phirar1);
+  Kokkos::deep_copy(d_phirar2,h_phirar2);
+  Kokkos::deep_copy(d_phirar3,h_phirar3);
+  Kokkos::deep_copy(d_phirar4,h_phirar4);
+  Kokkos::deep_copy(d_phirar5,h_phirar5);
+  Kokkos::deep_copy(d_phirar6,h_phirar6);
 }
diff --git a/src/KOKKOS/pair_meam_kokkos.cpp b/src/KOKKOS/pair_meam_kokkos.cpp
index 38386d0a7a..33343f4af5 100644
--- a/src/KOKKOS/pair_meam_kokkos.cpp
+++ b/src/KOKKOS/pair_meam_kokkos.cpp
@@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
+   https://www.lammps.org/, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
@@ -12,86 +12,67 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing author: Greg Wagner (SNL)
+   Contributing authors: Naga Vydyanathan (NVIDIA), Stan Moore (SNL)
 ------------------------------------------------------------------------- */
 
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <cstring>
-//KK*
-#include "meam_kokkos.h"
-#include "kokkos.h"
-#include "pair_kokkos.h"
-//#include "pair_meamc.h"
-
 #include "pair_meam_kokkos.h"
+#include "meam_kokkos.h"
+
 #include "atom_kokkos.h"
-//*KK
-#include "force.h"
-#include "comm.h"
-//KK*
-//#include "memory.h"
-#include "memory_kokkos.h"
-//*KK
-#include "neighbor.h"
-//KK*
-//#include "neigh_list.h"
-#include "neigh_list_kokkos.h"
-//*KK
-#include "neigh_request.h"
-#include "error.h"
-//*KK
 #include "atom_masks.h"
-//*KK
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
+#include "neigh_list_kokkos.h"
+#include "neigh_request.h"
+#include "neighbor.h"
+
+#include <cmath>
 
 using namespace LAMMPS_NS;
 
-#if 0
-static const int nkeywords = 21;
-static const char *keywords[] = {
-  "Ec","alpha","rho0","delta","lattce",
-  "attrac","repuls","nn2","Cmin","Cmax","rc","delr",
-  "augt1","gsmooth_factor","re","ialloy",
-  "mixture_ref_t","erose_form","zbl",
-  "emb_lin_neg","bkgd_dyn"};
-#endif
-
 /* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 PairMEAMKokkos<DeviceType>::PairMEAMKokkos(LAMMPS *lmp) : PairMEAM(lmp)
 {
   respa_enable = 0;
 
+  kokkosable = 1;
+  reverse_comm_device = 1;
   atomKK = (AtomKokkos *) atom;
   execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
   datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
   datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
-  meam_inst_kk = new MEAMKokkos<DeviceType>(memory);
+
   delete meam_inst;
+  meam_inst_kk = new MEAMKokkos<DeviceType>(memory);
   meam_inst = meam_inst_kk;
 }
+
 /* ---------------------------------------------------------------------- */
 
 template<class DeviceType>
 PairMEAMKokkos<DeviceType>::~PairMEAMKokkos()
 {
-  if (!copymode) {
-    memoryKK->destroy_kokkos(k_eatom,eatom);
-    memoryKK->destroy_kokkos(k_vatom,vatom);
-    delete meam_inst_kk;
-  }
+  if (copymode) return;
+
+  memoryKK->destroy_kokkos(k_eatom,eatom);
+  memoryKK->destroy_kokkos(k_vatom,vatom);
+  delete meam_inst_kk;
+  meam_inst = nullptr;
 }
 
 /* ---------------------------------------------------------------------- */
 
-/* ---------------------------------------------------------------------- */
 template<class DeviceType>
 void PairMEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
 {
   eflag = eflag_in;
   vflag = vflag_in;
-  
+
   if (neighflag == FULL) no_virial_fdotr_compute = 1;
 
   ev_init(eflag,vflag,0);
@@ -105,36 +86,32 @@ void PairMEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   }
   if (vflag_atom) {
     memoryKK->destroy_kokkos(k_vatom,vatom);
-    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
     d_vatom = k_vatom.view<DeviceType>();
   }
 
-  atomKK->sync(execution_space,datamask_read);
-  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
-  else atomKK->modified(execution_space,F_MASK);
-
-
   // neighbor list info
 
-  NeighListKokkos<DeviceType>* k_halflist = static_cast<NeighListKokkos<DeviceType>*>(listhalf);
   int inum_half = listhalf->inum;
-  int* numneigh_half = listhalf->numneigh;
-  int* ilist_half = listhalf->ilist;
-
+  NeighListKokkos<DeviceType>* k_halflist = static_cast<NeighListKokkos<DeviceType>*>(listhalf);
   d_ilist_half = k_halflist->d_ilist;
   d_numneigh_half = k_halflist->d_numneigh;
   d_neighbors_half = k_halflist->d_neighbors;
+
   NeighListKokkos<DeviceType>* k_fulllist = static_cast<NeighListKokkos<DeviceType>*>(listfull);
   d_numneigh_full = k_fulllist->d_numneigh;
   d_neighbors_full = k_fulllist->d_neighbors;
 
+  EV_FLOAT ev;
+
   copymode = 1;
+  meam_inst_kk->copymode = 1;
 
   // strip neighbor lists of any special bond flags before using with MEAM
   // necessary before doing neigh_f2c and neigh_c2f conversions each step
-  if (neighbor->ago == 0) {
-      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMEAMKernelNeighStrip >(0,inum_half),*this);
-  }
+
+  if (neighbor->ago == 0)
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMEAMNeighStrip >(0,inum_half),*this);
 
   // check size of scrfcn based on half neighbor list
 
@@ -142,152 +119,137 @@ void PairMEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
   nall = nlocal + atom->nghost;
 
   int n = 0;
-  //for (ii = 0; ii < inum_half; ii++) n += numneigh_half[ilist_half[ii]];
-  Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMEAMKernelA>(0,inum_half), *this, n);
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMEAMOffsets>(0,inum_half),*this,n);
 
   meam_inst_kk->meam_dens_setup(atom->nmax, nall, n);
 
-  //double **x = atom->x;
   x = atomKK->k_x.view<DeviceType>();
-
-  //double **f = atom->f;
   f = atomKK->k_f.view<DeviceType>();
-
-  //int *type = atom->type;
   type = atomKK->k_type.view<DeviceType>();
 
+  atomKK->sync(execution_space,datamask_read);
+
   int ntype = atom->ntypes;
 
   // 3 stages of MEAM calculation
   // loop over my atoms followed by communication
 
-  int offset = 0;
   int errorflag = 0;
-#if 0
-  for (ii = 0; ii < inum_half; ii++) {
-    i = ilist_half[ii];
-    meam_inst->meam_dens_init(i,ntype,type,map,x,
-                    numneigh_half[i],firstneigh_half[i],
-                    numneigh_full[i],firstneigh_full[i],
-                    offset);
-    offset += numneigh_half[i];
-  }
-#endif
-  // To do: create the cumulative offset array in host and device
   
-  k_offset = DAT::tdual_int_1d("pair:offset",inum_half+1);
-  h_offset = k_offset.h_view;
-  d_offset = k_offset.template view<DeviceType>();
-  ArrayTypes<LMPHostType>::t_int_1d h_ilist;
-  ArrayTypes<LMPHostType>::t_int_1d h_numneigh;
-  h_ilist = Kokkos::create_mirror_view(k_halflist->d_ilist);
-  h_numneigh = Kokkos::create_mirror_view(k_halflist->d_numneigh);
-  Kokkos::deep_copy(h_ilist,k_halflist->d_ilist);
-  Kokkos::deep_copy(h_numneigh,k_halflist->d_numneigh);
+  d_offset = typename AT::t_int_1d("pair:offset",inum_half+1);
+  {
+    // local variables for lambda capture
 
-  h_offset[0] = 0;
-  for (int ii = 0; ii < inum_half; ii++) {
-    int i = h_ilist[ii];
-    h_offset[ii+1] = h_offset[ii] + h_numneigh[i]; 
+    auto l_ilist_half = d_ilist_half;
+    auto l_numneigh_half = d_numneigh_half;
+    auto l_offset = d_offset;
+
+    Kokkos::parallel_scan(inum_half, LAMMPS_LAMBDA(int ii, int &m_fill, bool final) {
+      int i = l_ilist_half[ii];
+      m_fill += l_numneigh_half[i];
+      if (final)
+        l_offset[ii+1] = m_fill;
+    });
   }
-  k_offset.template modify<LMPHostType>();
-  k_offset.template sync<DeviceType>();
-  meam_inst_kk->meam_dens_init(inum_half,ntype,type,d_map,x,d_numneigh_half,d_numneigh_full,&offset,d_ilist_half,d_neighbors_half, d_neighbors_full, d_offset, neighflag);
+
+  int need_dup = lmp->kokkos->need_dup<DeviceType>();
+
+  meam_inst_kk->meam_dens_init(inum_half,ntype,type,d_map,x,d_numneigh_half,d_numneigh_full,d_ilist_half,d_neighbors_half, d_neighbors_full, d_offset, neighflag, need_dup);
+
   meam_inst_kk->k_rho0.template modify<DeviceType>();
-  meam_inst_kk->k_rho0.template sync<LMPHostType>();
-
   meam_inst_kk->k_arho2b.template modify<DeviceType>();
-  meam_inst_kk->k_arho2b.template sync<LMPHostType>();
-
   meam_inst_kk->k_arho1.template modify<DeviceType>();
-  meam_inst_kk->k_arho1.template sync<LMPHostType>();
-
   meam_inst_kk->k_arho2.template modify<DeviceType>();
-  meam_inst_kk->k_arho2.template sync<LMPHostType>();
-
   meam_inst_kk->k_arho3.template modify<DeviceType>();
-  meam_inst_kk->k_arho3.template sync<LMPHostType>();
-
   meam_inst_kk->k_arho3b.template modify<DeviceType>();
-  meam_inst_kk->k_arho3b.template sync<LMPHostType>();
-
   meam_inst_kk->k_t_ave.template modify<DeviceType>();
-  meam_inst_kk->k_t_ave.template sync<LMPHostType>();
-
   meam_inst_kk->k_tsq_ave.template modify<DeviceType>();
-  meam_inst_kk->k_tsq_ave.template sync<LMPHostType>();
 
   comm->reverse_comm(this);
 
-  meam_inst_kk->k_rho0.template modify<LMPHostType>();
   meam_inst_kk->k_rho0.template sync<DeviceType>();
-
-  meam_inst_kk->k_arho2b.template modify<LMPHostType>();
   meam_inst_kk->k_arho2b.template sync<DeviceType>();
-
-  meam_inst_kk->k_arho1.template modify<LMPHostType>();
   meam_inst_kk->k_arho1.template sync<DeviceType>();
-
-  meam_inst_kk->k_arho2.template modify<LMPHostType>();
   meam_inst_kk->k_arho2.template sync<DeviceType>();
-
-  meam_inst_kk->k_arho3.template modify<LMPHostType>();
   meam_inst_kk->k_arho3.template sync<DeviceType>();
-
-  meam_inst_kk->k_arho3b.template modify<LMPHostType>();
   meam_inst_kk->k_arho3b.template sync<DeviceType>();
-
-  meam_inst_kk->k_t_ave.template modify<LMPHostType>();
   meam_inst_kk->k_t_ave.template sync<DeviceType>();
-
-  meam_inst_kk->k_tsq_ave.template modify<LMPHostType>();
   meam_inst_kk->k_tsq_ave.template sync<DeviceType>();
 
-
   meam_inst_kk->meam_dens_final(nlocal,eflag_either,eflag_global,eflag_atom,
-                   &eng_vdwl,d_eatom,ntype,type,d_map,errorflag);
-  if (errorflag) {
-    char str[128];
-    sprintf(str,"MEAM library error %d",errorflag);
-    error->one(FLERR,str);
-  }
+                   d_eatom,ntype,type,d_map,d_scale,errorflag,ev);
+
+  if (errorflag)
+    error->one(FLERR,"MEAM library error {}",errorflag);
+
+  meam_inst_kk->k_rho0.template modify<DeviceType>();
+  meam_inst_kk->k_rho1.template modify<DeviceType>();
+  meam_inst_kk->k_rho2.template modify<DeviceType>();
+  meam_inst_kk->k_rho3.template modify<DeviceType>();
+  meam_inst_kk->k_frhop.template modify<DeviceType>();
+  meam_inst_kk->k_gamma.template modify<DeviceType>();
+  meam_inst_kk->k_dgamma1.template modify<DeviceType>();
+  meam_inst_kk->k_dgamma2.template modify<DeviceType>();
+  meam_inst_kk->k_dgamma3.template modify<DeviceType>();
+  meam_inst_kk->k_arho2b.template modify<DeviceType>();
+  meam_inst_kk->k_arho1.template modify<DeviceType>();
+  meam_inst_kk->k_arho2.template modify<DeviceType>();
+  meam_inst_kk->k_arho3.template modify<DeviceType>();
+  meam_inst_kk->k_arho3b.template modify<DeviceType>();
+  meam_inst_kk->k_t_ave.template modify<DeviceType>();
+  meam_inst_kk->k_tsq_ave.template modify<DeviceType>();
 
   comm->forward_comm(this);
 
-  offset = 0;
+  meam_inst_kk->k_rho0.template sync<DeviceType>();
+  meam_inst_kk->k_rho1.template sync<DeviceType>();
+  meam_inst_kk->k_rho2.template sync<DeviceType>();
+  meam_inst_kk->k_rho3.template sync<DeviceType>();
+  meam_inst_kk->k_frhop.template sync<DeviceType>();
+  meam_inst_kk->k_gamma.template sync<DeviceType>();
+  meam_inst_kk->k_dgamma1.template sync<DeviceType>();
+  meam_inst_kk->k_dgamma2.template sync<DeviceType>();
+  meam_inst_kk->k_dgamma3.template sync<DeviceType>();
+  meam_inst_kk->k_arho2b.template sync<DeviceType>();
+  meam_inst_kk->k_arho1.template sync<DeviceType>();
+  meam_inst_kk->k_arho2.template sync<DeviceType>();
+  meam_inst_kk->k_arho3.template sync<DeviceType>();
+  meam_inst_kk->k_arho3b.template sync<DeviceType>();
+  meam_inst_kk->k_t_ave.template sync<DeviceType>();
+  meam_inst_kk->k_tsq_ave.template sync<DeviceType>();
 
-  // vptr is first value in vatom if it will be used by meam_force()
-  // else vatom may not exist, so pass dummy ptr
+  meam_inst_kk->meam_force(inum_half,eflag_global,eflag_atom,vflag_global,
+                           vflag_atom,d_eatom,ntype,type,d_map,x,
+                           d_numneigh_half, d_numneigh_full,f,d_vatom,
+                           d_ilist_half, d_offset, d_neighbors_half, d_neighbors_full,
+                           neighflag, need_dup, ev);
 
-#if 0 // To do: is this correct? vflag_atom is used to access vatom
-  typename ArrayTypes<DeviceType>::t_virial_array vptr;
-  if (vflag_atom) vptr = d_vatom;
-  else vptr = NULL;
-  for (ii = 0; ii < inum_half; ii++) {
-    i = ilist_half[ii];
-    meam_inst->meam_force(i,eflag_either,eflag_global,eflag_atom,
-                vflag_atom,&eng_vdwl,eatom,ntype,type,map,x,
-                numneigh_half[i],firstneigh_half[i],
-                numneigh_full[i],firstneigh_full[i],
-                offset,f,vptr);
-    offset += numneigh_half[i];
+  if (eflag_global) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
   }
-#endif
-  meam_inst_kk->meam_force(inum_half, eflag_either,eflag_global,eflag_atom,
-                vflag_atom,&eng_vdwl,d_eatom,ntype,type,d_map,x,
-                d_numneigh_half, d_numneigh_full,f,d_vatom,d_ilist_half, d_offset, d_neighbors_half, d_neighbors_full, neighflag);
 
   if (vflag_fdotr) pair_virial_fdotr_compute(this);
   if (eflag_atom) {
     k_eatom.template modify<DeviceType>();
-    k_eatom.template sync<LMPHostType>();
+    k_eatom.sync_host();
   }
 
   if (vflag_atom) {
     k_vatom.template modify<DeviceType>();
-    k_vatom.template sync<LMPHostType>();
+    k_vatom.sync_host();
   }
 
+  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
+  else atomKK->modified(execution_space,F_MASK);
+
+  copymode = 0;
+  meam_inst_kk->copymode = 0;
 }
 
 /* ----------------------------------------------------------------------
@@ -298,24 +260,22 @@ void PairMEAMKokkos<DeviceType>::coeff(int narg, char **arg)
 {
   PairMEAM::coeff(narg,arg);
 
-  //sync map
+  // sync map and scale
 
   int n = atom->ntypes;
-  k_map = DAT::tdual_int_1d("pair:map",n+1);
-  HAT::t_int_1d h_map = k_map.h_view;
+  MemKK::realloc_kokkos(d_map,"pair:map",n+1);
+  MemKK::realloc_kokkos(d_scale,"pair:scale",n+1,n+1);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  auto h_scale = Kokkos::create_mirror_view(d_scale);
 
-  for (int i = 1; i <= n; i++)
+  for (int i = 1; i <= n; i++) {
     h_map[i] = map[i];
+    for (int j = 1; j <= n; j++)
+      h_scale(i,j) = scale[i][j];
+  }
 
-  k_map.template modify<LMPHostType>();
-  k_map.template sync<DeviceType>();
-
-  d_map = k_map.template view<DeviceType>();
-
-  // To do: need to synchronize phirar variables
-   
-  meam_inst_kk->meam_setup_done(); 
-  
+  Kokkos::deep_copy(d_map,h_map);
+  Kokkos::deep_copy(d_scale,h_scale);
 }
 
 /* ----------------------------------------------------------------------
@@ -324,24 +284,27 @@ void PairMEAMKokkos<DeviceType>::coeff(int narg, char **arg)
 template<class DeviceType>
 void PairMEAMKokkos<DeviceType>::init_style()
 {
-
   PairMEAM::init_style();
 
+  // adjust neighbor list request for KOKKOS
+
   neighflag = lmp->kokkos->neighflag;
-  auto request = neighbor->find_request(this);
-
-  // MEAM needs both a full and half neighbor list? Not sure how to get that.
-  if (!(neighflag == FULL || neighflag == HALF || neighflag == HALFTHREAD))
-    error->all(FLERR, "Cannot use chosen neighbor list style with pair meam/kk");
-
+  auto request = neighbor->find_request(this,1);
   request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
                            !std::is_same<DeviceType,LMPDeviceType>::value);
   request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
-  if (neighflag == FULL) request->enable_full();
 
+  request = neighbor->find_request(this,2);
+  request->set_kokkos_host(std::is_same<DeviceType,LMPHostType>::value &&
+                           !std::is_same<DeviceType,LMPDeviceType>::value);
+  request->set_kokkos_device(std::is_same<DeviceType,LMPDeviceType>::value);
+
+  if (neighflag == FULL)
+    error->all(FLERR,"Must use half neighbor list style with pair meam/kk");
 }
 
 /* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 int PairMEAMKokkos<DeviceType>::pack_forward_comm_kokkos(int n, DAT::tdual_int_2d k_sendlist, int iswap_in, DAT::tdual_xfloat_1d &buf,
                                 int pbc_flag, int *pbc)
@@ -350,9 +313,11 @@ int PairMEAMKokkos<DeviceType>::pack_forward_comm_kokkos(int n, DAT::tdual_int_2
   iswap = iswap_in;
   v_buf = buf.view<DeviceType>();
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMEAMPackForwardComm>(0,n),*this);
-  return n;
+  return n*38;
 }
 
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMPackForwardComm, const int &i) const {
@@ -361,35 +326,36 @@ void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMPackForwardComm, const in
   v_buf[m++] = meam_inst_kk->d_rho0[j];
   v_buf[m++] = meam_inst_kk->d_rho1[j];
   v_buf[m++] = meam_inst_kk->d_rho2[j];
-    v_buf[m++] = meam_inst_kk->d_rho3[j];
-    v_buf[m++] = meam_inst_kk->d_frhop[j];
-    v_buf[m++] = meam_inst_kk->d_gamma[j];
-    v_buf[m++] = meam_inst_kk->d_dgamma1[j];
-    v_buf[m++] = meam_inst_kk->d_dgamma2[j];
-    v_buf[m++] = meam_inst_kk->d_dgamma3[j];
-    v_buf[m++] = meam_inst_kk->d_arho2b[j];
-    v_buf[m++] = meam_inst_kk->d_arho1(j,0);
-    v_buf[m++] = meam_inst_kk->d_arho1(j,1);
-    v_buf[m++] = meam_inst_kk->d_arho1(j,2);
-    v_buf[m++] = meam_inst_kk->d_arho2(j,0);
-    v_buf[m++] = meam_inst_kk->d_arho2(j,1);
-    v_buf[m++] = meam_inst_kk->d_arho2(j,2);
-    v_buf[m++] = meam_inst_kk->d_arho2(j,3);
-    v_buf[m++] = meam_inst_kk->d_arho2(j,4);
-    v_buf[m++] = meam_inst_kk->d_arho2(j,5);
-    for (int k = 0; k < 10; k++) v_buf[m++] = meam_inst_kk->d_arho3(j,k);
-    v_buf[m++] = meam_inst_kk->d_arho3b(j,0);
-    v_buf[m++] = meam_inst_kk->d_arho3b(j,1);
-    v_buf[m++] = meam_inst_kk->d_arho3b(j,2);
-    v_buf[m++] = meam_inst_kk->d_t_ave(j,0);
-    v_buf[m++] = meam_inst_kk->d_t_ave(j,1);
-    v_buf[m++] = meam_inst_kk->d_t_ave(j,2);
-    v_buf[m++] = meam_inst_kk->d_tsq_ave(j,0);
-    v_buf[m++] = meam_inst_kk->d_tsq_ave(j,1);
-    v_buf[m++] = meam_inst_kk->d_tsq_ave(j,2);
+  v_buf[m++] = meam_inst_kk->d_rho3[j];
+  v_buf[m++] = meam_inst_kk->d_frhop[j];
+  v_buf[m++] = meam_inst_kk->d_gamma[j];
+  v_buf[m++] = meam_inst_kk->d_dgamma1[j];
+  v_buf[m++] = meam_inst_kk->d_dgamma2[j];
+  v_buf[m++] = meam_inst_kk->d_dgamma3[j];
+  v_buf[m++] = meam_inst_kk->d_arho2b[j];
+  v_buf[m++] = meam_inst_kk->d_arho1(j,0);
+  v_buf[m++] = meam_inst_kk->d_arho1(j,1);
+  v_buf[m++] = meam_inst_kk->d_arho1(j,2);
+  v_buf[m++] = meam_inst_kk->d_arho2(j,0);
+  v_buf[m++] = meam_inst_kk->d_arho2(j,1);
+  v_buf[m++] = meam_inst_kk->d_arho2(j,2);
+  v_buf[m++] = meam_inst_kk->d_arho2(j,3);
+  v_buf[m++] = meam_inst_kk->d_arho2(j,4);
+  v_buf[m++] = meam_inst_kk->d_arho2(j,5);
+  for (int k = 0; k < 10; k++) v_buf[m++] = meam_inst_kk->d_arho3(j,k);
+  v_buf[m++] = meam_inst_kk->d_arho3b(j,0);
+  v_buf[m++] = meam_inst_kk->d_arho3b(j,1);
+  v_buf[m++] = meam_inst_kk->d_arho3b(j,2);
+  v_buf[m++] = meam_inst_kk->d_t_ave(j,0);
+  v_buf[m++] = meam_inst_kk->d_t_ave(j,1);
+  v_buf[m++] = meam_inst_kk->d_t_ave(j,2);
+  v_buf[m++] = meam_inst_kk->d_tsq_ave(j,0);
+  v_buf[m++] = meam_inst_kk->d_tsq_ave(j,1);
+  v_buf[m++] = meam_inst_kk->d_tsq_ave(j,2);
 } 
 
 /* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 void PairMEAMKokkos<DeviceType>::unpack_forward_comm_kokkos(int n, int first_in, DAT::tdual_xfloat_1d &buf)
 {
@@ -398,6 +364,8 @@ void PairMEAMKokkos<DeviceType>::unpack_forward_comm_kokkos(int n, int first_in,
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMEAMUnpackForwardComm>(0,n),*this);
 }
 
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
 void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMUnpackForwardComm, const int &i) const{
@@ -432,18 +400,34 @@ void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMUnpackForwardComm, const
     meam_inst_kk->d_tsq_ave(i+first,0) = v_buf[m++];
     meam_inst_kk->d_tsq_ave(i+first,1) = v_buf[m++];
     meam_inst_kk->d_tsq_ave(i+first,2) = v_buf[m++];
- } 
+ }
+
+/* ---------------------------------------------------------------------- */ 
 
 template<class DeviceType>
 int PairMEAMKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf,
                                int pbc_flag, int *pbc)
 {
-    int i,j,k,m;
+  meam_inst_kk->k_rho0.sync_host();
+  meam_inst_kk->k_rho1.sync_host();
+  meam_inst_kk->k_rho2.sync_host();
+  meam_inst_kk->k_rho3.sync_host();
+  meam_inst_kk->k_frhop.sync_host();
+  meam_inst_kk->k_gamma.sync_host();
+  meam_inst_kk->k_dgamma1.sync_host();
+  meam_inst_kk->k_dgamma2.sync_host();
+  meam_inst_kk->k_dgamma3.sync_host();
+  meam_inst_kk->k_arho2b.sync_host();
+  meam_inst_kk->k_arho1.sync_host();
+  meam_inst_kk->k_arho2.sync_host();
+  meam_inst_kk->k_arho3.sync_host();
+  meam_inst_kk->k_arho3b.sync_host();
+  meam_inst_kk->k_t_ave.sync_host();
+  meam_inst_kk->k_tsq_ave.sync_host();
 
-    m = 0;
-
-    for (i = 0; i < n; i++) {
-       j = list[i];
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
     buf[m++] = meam_inst_kk->h_rho0[j];
     buf[m++] = meam_inst_kk->h_rho1[j];
     buf[m++] = meam_inst_kk->h_rho2[j];
@@ -463,7 +447,7 @@ int PairMEAMKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf,
     buf[m++] = meam_inst_kk->h_arho2(j,3);
     buf[m++] = meam_inst_kk->h_arho2(j,4);
     buf[m++] = meam_inst_kk->h_arho2(j,5);
-    for (k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3(j,k);
+    for (int k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3(j,k);
     buf[m++] = meam_inst_kk->h_arho3b(j,0);
     buf[m++] = meam_inst_kk->h_arho3b(j,1);
     buf[m++] = meam_inst_kk->h_arho3b(j,2);
@@ -478,14 +462,31 @@ int PairMEAMKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf,
   return m;
 }
 
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 void PairMEAMKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
 {
-    int i,k,m,last;
+  meam_inst_kk->k_rho0.sync_host();
+  meam_inst_kk->k_rho1.sync_host();
+  meam_inst_kk->k_rho2.sync_host();
+  meam_inst_kk->k_rho3.sync_host();
+  meam_inst_kk->k_frhop.sync_host();
+  meam_inst_kk->k_gamma.sync_host();
+  meam_inst_kk->k_dgamma1.sync_host();
+  meam_inst_kk->k_dgamma2.sync_host();
+  meam_inst_kk->k_dgamma3.sync_host();
+  meam_inst_kk->k_arho2b.sync_host();
+  meam_inst_kk->k_arho1.sync_host();
+  meam_inst_kk->k_arho2.sync_host();
+  meam_inst_kk->k_arho3.sync_host();
+  meam_inst_kk->k_arho3b.sync_host();
+  meam_inst_kk->k_t_ave.sync_host();
+  meam_inst_kk->k_tsq_ave.sync_host();
 
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
+  int m = 0;
+  const int last = first + n;
+  for (int i = first; i < last; i++) {
     meam_inst_kk->h_rho0[i] = buf[m++];
     meam_inst_kk->h_rho1[i] = buf[m++];
     meam_inst_kk->h_rho2[i] = buf[m++];
@@ -505,7 +506,7 @@ void PairMEAMKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *b
     meam_inst_kk->h_arho2(i,3) = buf[m++];
     meam_inst_kk->h_arho2(i,4) = buf[m++];
     meam_inst_kk->h_arho2(i,5) = buf[m++];
-    for (k = 0; k < 10; k++) meam_inst_kk->h_arho3(i,k) = buf[m++];
+    for (int k = 0; k < 10; k++) meam_inst_kk->h_arho3(i,k) = buf[m++];
     meam_inst_kk->h_arho3b(i,0) = buf[m++];
     meam_inst_kk->h_arho3b(i,1) = buf[m++];
     meam_inst_kk->h_arho3b(i,2) = buf[m++];
@@ -516,17 +517,83 @@ void PairMEAMKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *b
     meam_inst_kk->h_tsq_ave(i,1) = buf[m++];
     meam_inst_kk->h_tsq_ave(i,2) = buf[m++];
   }
+
+  meam_inst_kk->k_rho0.modify_host();
+  meam_inst_kk->k_rho1.modify_host();
+  meam_inst_kk->k_rho2.modify_host();
+  meam_inst_kk->k_rho3.modify_host();
+  meam_inst_kk->k_frhop.modify_host();
+  meam_inst_kk->k_gamma.modify_host();
+  meam_inst_kk->k_dgamma1.modify_host();
+  meam_inst_kk->k_dgamma2.modify_host();
+  meam_inst_kk->k_dgamma3.modify_host();
+  meam_inst_kk->k_arho2b.modify_host();
+  meam_inst_kk->k_arho1.modify_host();
+  meam_inst_kk->k_arho2.modify_host();
+  meam_inst_kk->k_arho3.modify_host();
+  meam_inst_kk->k_arho3b.modify_host();
+  meam_inst_kk->k_t_ave.modify_host();
+  meam_inst_kk->k_tsq_ave.modify_host();
 }
 
 /* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PairMEAMKokkos<DeviceType>::pack_reverse_comm_kokkos(int n, int first_in, DAT::tdual_xfloat_1d &buf)
+{
+  first = first_in;
+  v_buf = buf.view<DeviceType>();
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMEAMPackReverseComm>(0,n),*this);
+  return n*30;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMPackReverseComm, const int &i) const {
+  int m = i*30;
+
+  v_buf[m++] = meam_inst_kk->d_rho0[i+first];
+  v_buf[m++] = meam_inst_kk->d_arho2b[i+first];
+  v_buf[m++] = meam_inst_kk->d_arho1(i+first,0);
+  v_buf[m++] = meam_inst_kk->d_arho1(i+first,1);
+  v_buf[m++] = meam_inst_kk->d_arho1(i+first,2);
+  v_buf[m++] = meam_inst_kk->d_arho2(i+first,0);
+  v_buf[m++] = meam_inst_kk->d_arho2(i+first,1);
+  v_buf[m++] = meam_inst_kk->d_arho2(i+first,2);
+  v_buf[m++] = meam_inst_kk->d_arho2(i+first,3);
+  v_buf[m++] = meam_inst_kk->d_arho2(i+first,4);
+  v_buf[m++] = meam_inst_kk->d_arho2(i+first,5);
+  for (int k = 0; k < 10; k++) v_buf[m++] = meam_inst_kk->d_arho3(i+first,k);
+  v_buf[m++] = meam_inst_kk->d_arho3b(i+first,0);
+  v_buf[m++] = meam_inst_kk->d_arho3b(i+first,1);
+  v_buf[m++] = meam_inst_kk->d_arho3b(i+first,2);
+  v_buf[m++] = meam_inst_kk->d_t_ave(i+first,0);
+  v_buf[m++] = meam_inst_kk->d_t_ave(i+first,1);
+  v_buf[m++] = meam_inst_kk->d_t_ave(i+first,2);
+  v_buf[m++] = meam_inst_kk->d_tsq_ave(i+first,0);
+  v_buf[m++] = meam_inst_kk->d_tsq_ave(i+first,1);
+  v_buf[m++] = meam_inst_kk->d_tsq_ave(i+first,2);
+}
+
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 int PairMEAMKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
 {
-  int i,k,m,last;
+  meam_inst_kk->k_rho0.sync_host();
+  meam_inst_kk->k_arho2b.sync_host();
+  meam_inst_kk->k_arho1.sync_host();
+  meam_inst_kk->k_arho2.sync_host();
+  meam_inst_kk->k_arho3.sync_host();
+  meam_inst_kk->k_arho3b.sync_host();
+  meam_inst_kk->k_t_ave.sync_host();
+  meam_inst_kk->k_tsq_ave.sync_host();
 
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
+  int m = 0;
+  const int last = first + n;
+  for (int i = first; i < last; i++) {
     buf[m++] = meam_inst_kk->h_rho0[i];
     buf[m++] = meam_inst_kk->h_arho2b[i];
     buf[m++] = meam_inst_kk->h_arho1(i,0);
@@ -538,7 +605,7 @@ int PairMEAMKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
     buf[m++] = meam_inst_kk->h_arho2(i,3);
     buf[m++] = meam_inst_kk->h_arho2(i,4);
     buf[m++] = meam_inst_kk->h_arho2(i,5);
-    for (k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3(i,k);
+    for (int k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3(i,k);
     buf[m++] = meam_inst_kk->h_arho3b(i,0);
     buf[m++] = meam_inst_kk->h_arho3b(i,1);
     buf[m++] = meam_inst_kk->h_arho3b(i,2);
@@ -554,14 +621,64 @@ int PairMEAMKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
 }
 
 /* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMEAMKokkos<DeviceType>::unpack_reverse_comm_kokkos(int n, DAT::tdual_int_2d k_sendlist, int iswap_in, DAT::tdual_xfloat_1d &buf)
+{
+  d_sendlist = k_sendlist.view<DeviceType>();
+  iswap = iswap_in;
+  v_buf = buf.view<DeviceType>();
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMEAMUnpackReverseComm>(0,n),*this);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMUnpackReverseComm, const int &i) const {
+  int j = d_sendlist(iswap, i);
+  int m = i*30;
+
+  meam_inst_kk->d_rho0[j] += v_buf[m++];
+  meam_inst_kk->d_arho2b[j] += v_buf[m++];
+  meam_inst_kk->d_arho1(j,0) += v_buf[m++];
+  meam_inst_kk->d_arho1(j,1) += v_buf[m++];
+  meam_inst_kk->d_arho1(j,2) += v_buf[m++];
+  meam_inst_kk->d_arho2(j,0) += v_buf[m++];
+  meam_inst_kk->d_arho2(j,1) += v_buf[m++];
+  meam_inst_kk->d_arho2(j,2) += v_buf[m++];
+  meam_inst_kk->d_arho2(j,3) += v_buf[m++];
+  meam_inst_kk->d_arho2(j,4) += v_buf[m++];
+  meam_inst_kk->d_arho2(j,5) += v_buf[m++];
+  for (int k = 0; k < 10; k++) meam_inst_kk->d_arho3(j,k) += v_buf[m++];
+  meam_inst_kk->d_arho3b(j,0) += v_buf[m++];
+  meam_inst_kk->d_arho3b(j,1) += v_buf[m++];
+  meam_inst_kk->d_arho3b(j,2) += v_buf[m++];
+  meam_inst_kk->d_t_ave(j,0) += v_buf[m++];
+  meam_inst_kk->d_t_ave(j,1) += v_buf[m++];
+  meam_inst_kk->d_t_ave(j,2) += v_buf[m++];
+  meam_inst_kk->d_tsq_ave(j,0) += v_buf[m++];
+  meam_inst_kk->d_tsq_ave(j,1) += v_buf[m++];
+  meam_inst_kk->d_tsq_ave(j,2) += v_buf[m++];
+}
+
+/* ---------------------------------------------------------------------- */
+
 template<class DeviceType>
 void PairMEAMKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
 {
-  int i,j,k,m;
+  meam_inst_kk->k_rho0.sync_host();
+  meam_inst_kk->k_arho2b.sync_host();
+  meam_inst_kk->k_arho1.sync_host();
+  meam_inst_kk->k_arho2.sync_host();
+  meam_inst_kk->k_arho3.sync_host();
+  meam_inst_kk->k_arho3b.sync_host();
+  meam_inst_kk->k_t_ave.sync_host();
+  meam_inst_kk->k_tsq_ave.sync_host();
 
-  m = 0;
-  for (i = 0; i < n; i++) {
-    j = list[i];
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
     meam_inst_kk->h_rho0[j] += buf[m++];
     meam_inst_kk->h_arho2b[j] += buf[m++];
     meam_inst_kk->h_arho1(j,0) += buf[m++];
@@ -573,7 +690,7 @@ void PairMEAMKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *b
     meam_inst_kk->h_arho2(j,3) += buf[m++];
     meam_inst_kk->h_arho2(j,4) += buf[m++];
     meam_inst_kk->h_arho2(j,5) += buf[m++];
-    for (k = 0; k < 10; k++) meam_inst_kk->h_arho3(j,k) += buf[m++];
+    for (int k = 0; k < 10; k++) meam_inst_kk->h_arho3(j,k) += buf[m++];
     meam_inst_kk->h_arho3b(j,0) += buf[m++];
     meam_inst_kk->h_arho3b(j,1) += buf[m++];
     meam_inst_kk->h_arho3b(j,2) += buf[m++];
@@ -584,46 +701,48 @@ void PairMEAMKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *b
     meam_inst_kk->h_tsq_ave(j,1) += buf[m++];
     meam_inst_kk->h_tsq_ave(j,2) += buf[m++];
   }
+
+  meam_inst_kk->k_rho0.modify_host();
+  meam_inst_kk->k_arho2b.modify_host();
+  meam_inst_kk->k_arho1.modify_host();
+  meam_inst_kk->k_arho2.modify_host();
+  meam_inst_kk->k_arho3.modify_host();
+  meam_inst_kk->k_arho3b.modify_host();
+  meam_inst_kk->k_t_ave.modify_host();
+  meam_inst_kk->k_tsq_ave.modify_host();
 }
 
 /* ----------------------------------------------------------------------
-   memory usage of local atom-based arrays
+   strip special bond flags from neighbor list entries
+   are not used with MEAM
+   need to do here so Fortran lib doesn't see them
+   done once per reneighbor so that neigh_f2c and neigh_c2f don't see them
 ------------------------------------------------------------------------- */
-template<class DeviceType>
-double PairMEAMKokkos<DeviceType>::memory_usage()
-{
-  double bytes = 11 * meam_inst_kk->nmax * sizeof(double);
-  bytes += (3 + 6 + 10 + 3 + 3 + 3) * meam_inst_kk->nmax * sizeof(double);
-  bytes += 3 * meam_inst_kk->maxneigh * sizeof(double);
-  return bytes;
-}
-template<class DeviceType>
-KOKKOS_INLINE_FUNCTION
-void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMKernelNeighStrip, const int &ii) const {
-/* ----------------------------------------------------------------------
- *    strip special bond flags from neighbor list entries
- *       are not used with MEAM
- *          need to do here so Fortran lib doesn't see them
- *             done once per reneighbor so that neigh_f2c and neigh_c2f don't see them
- *             ------------------------------------------------------------------------- */
-    const int i = d_ilist_half[ii];
-    const int jnum_half = d_numneigh_half[i];
-    const int jnum_full = d_numneigh_full[i];
-    for (int jj = 0; jj < jnum_half; jj++) {
-        d_neighbors_half(i,jj) &= NEIGHMASK;
-    }
-    for (int jj = 0; jj < jnum_full; jj++) {
-        d_neighbors_full(i,jj) &= NEIGHMASK;
-    }
-}
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMKernelA, const int ii, int &n) const {
-    const int i = d_ilist_half[ii];
-    n += d_numneigh_half[i]; 
+void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMNeighStrip, const int &ii) const {
+
+  const int i = d_ilist_half[ii];
+  const int jnum_half = d_numneigh_half[i];
+  const int jnum_full = d_numneigh_full[i];
+  for (int jj = 0; jj < jnum_half; jj++)
+    d_neighbors_half(i,jj) &= NEIGHMASK;
+  for (int jj = 0; jj < jnum_full; jj++)
+    d_neighbors_full(i,jj) &= NEIGHMASK;
 }
 
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMOffsets, const int ii, int &n) const {
+  const int i = d_ilist_half[ii];
+  n += d_numneigh_half[i]; 
+}
+
+/* ---------------------------------------------------------------------- */
+
 namespace LAMMPS_NS {
 template class PairMEAMKokkos<LMPDeviceType>;
 #ifdef KOKKOS_ENABLE_CUDA
diff --git a/src/KOKKOS/pair_meam_kokkos.h b/src/KOKKOS/pair_meam_kokkos.h
index 114f4a1665..c18c9151f2 100644
--- a/src/KOKKOS/pair_meam_kokkos.h
+++ b/src/KOKKOS/pair_meam_kokkos.h
@@ -1,6 +1,6 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
+   https://www.lammps.org/, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
@@ -12,31 +12,33 @@
 ------------------------------------------------------------------------- */
 
 #ifdef PAIR_CLASS
-
+// clang-format off
 PairStyle(meam/c/kk,PairMEAMKokkos<LMPDeviceType>)
 PairStyle(meam/c/kk/device,PairMEAMKokkos<LMPDeviceType>)
 PairStyle(meam/c/kk/host,PairMEAMKokkos<LMPHostType>)
 PairStyle(meam/kk,PairMEAMKokkos<LMPDeviceType>)
 PairStyle(meam/kk/device,PairMEAMKokkos<LMPDeviceType>)
 PairStyle(meam/kk/host,PairMEAMKokkos<LMPHostType>)
-
+// clang-format on
 #else
 
-#ifndef LMP_PAIR_MEAMC_KOKKOS_H
-#define LMP_PAIR_MEAMC_KOKKOS_H
+// clang-format off
+#ifndef LMP_PAIR_MEAM_KOKKOS_H
+#define LMP_PAIR_MEAM_KOKKOS_H
 
 #include "kokkos_base.h"
 #include "pair_kokkos.h"
 #include "pair_meam.h"
-#include "neigh_list_kokkos.h"
 #include "meam_kokkos.h"
 
-
 namespace LAMMPS_NS {
-struct TagPairMEAMKernelNeighStrip{};
-struct TagPairMEAMKernelA{};
+
+struct TagPairMEAMNeighStrip{};
+struct TagPairMEAMOffsets{};
 struct TagPairMEAMPackForwardComm{};
 struct TagPairMEAMUnpackForwardComm{};
+struct TagPairMEAMPackReverseComm{};
+struct TagPairMEAMUnpackReverseComm{};
 
 template<class DeviceType>
 class MEAMKokkos;
@@ -48,13 +50,13 @@ class PairMEAMKokkos : public PairMEAM, public KokkosBase {
   enum {COUL_FLAG=0};
   typedef DeviceType device_type;
   typedef ArrayTypes<DeviceType> AT;
-  //typedef EV_FLOAT value_type;
+  typedef int value_type;
 
   PairMEAMKokkos(class LAMMPS *);
-  virtual ~PairMEAMKokkos();
-  void compute(int, int);
-  void coeff(int, char **);
-  void init_style();
+  ~PairMEAMKokkos() override;
+  void compute(int, int) override;
+  void coeff(int, char **) override;
+  void init_style() override;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPairMEAMPackForwardComm, const int&) const;
@@ -63,104 +65,59 @@ class PairMEAMKokkos : public PairMEAM, public KokkosBase {
   void operator()(TagPairMEAMUnpackForwardComm, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairMEAMKernelNeighStrip,  const int&) const;
+  void operator()(TagPairMEAMPackReverseComm, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPairMEAMKernelA,  const int, int&) const;
+  void operator()(TagPairMEAMUnpackReverseComm, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMEAMNeighStrip,  const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMEAMOffsets,  const int, int&) const;
 
   int pack_forward_comm_kokkos(int, DAT::tdual_int_2d, int, DAT::tdual_xfloat_1d&,
-                       int, int *);
-  void unpack_forward_comm_kokkos(int, int, DAT::tdual_xfloat_1d&);
-  int pack_forward_comm(int, int *, double *, int, int *);
-  void unpack_forward_comm(int, int, double *);
-  int pack_reverse_comm(int, int, double *);
-  void unpack_reverse_comm(int, int *, double *);
-  double memory_usage();
+                               int, int *) override;
+  int pack_forward_comm(int, int *, double *, int, int *) override;
+  void unpack_forward_comm_kokkos(int, int, DAT::tdual_xfloat_1d&) override;
+  void unpack_forward_comm(int, int, double *) override;
+  int pack_reverse_comm_kokkos(int, int, DAT::tdual_xfloat_1d&) override;
+  int pack_reverse_comm(int, int, double *) override;
+  void unpack_reverse_comm_kokkos(int, DAT::tdual_int_2d,
+                                  int, DAT::tdual_xfloat_1d&) override;
+  void unpack_reverse_comm(int, int *, double *) override;
 
  protected:
   class MEAMKokkos<DeviceType> *meam_inst_kk;
-  typename AT::t_x_array_randomread x;
+  typename AT::t_x_array x;
   typename AT::t_f_array f;
-  typename AT::t_int_1d_randomread type;
+  typename AT::t_int_1d type;
   
   DAT::tdual_efloat_1d k_eatom;
   DAT::tdual_virial_array k_vatom;
-  typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
-  typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
 
-  DAT::tdual_int_1d k_offset; 
-  HAT::t_int_1d h_offset;
-  typename AT::t_int_1d_randomread d_offset;
+  typename AT::t_int_1d d_offset;
   
   DAT::tdual_int_1d k_map; 
-  typename AT::t_int_1d_randomread d_map;
-  typename AT::t_int_1d_randomread d_ilist_half;
-  typename AT::t_int_1d_randomread d_numneigh_half;
+  typename AT::t_int_1d d_map;
+  typename AT::t_int_2d d_scale;
+  typename AT::t_int_1d d_ilist_half;
+  typename AT::t_int_1d d_numneigh_half;
   typename AT::t_neighbors_2d d_neighbors_half;
-  typename AT::t_int_1d_randomread d_numneigh_full;
+  typename AT::t_int_1d d_numneigh_full;
   typename AT::t_neighbors_2d d_neighbors_full;
   typename AT::t_int_2d d_sendlist;
   typename AT::t_xfloat_1d_um v_buf;
-   
+
+  int iswap,first;   
   int neighflag,nlocal,nall,eflag,vflag;
-  int iswap; 
-  int first;
 
   friend void pair_virial_fdotr_compute<PairMEAMKokkos>(PairMEAMKokkos*);
-
 };
 
 }
-
 #endif
 #endif
 
-/* ERROR/WARNING messages:
-
-E: MEAM library error %d
-
-A call to the MEAM Fortran library returned an error.
-
-E: Illegal ... command
-
-Self-explanatory.  Check the input script syntax and compare to the
-documentation for the command.  You can use -echo screen as a
-command-line option when running LAMMPS to see the offending line.
-
-E: Incorrect args for pair coefficients
-
-Self-explanatory.  Check the input script or data file.
-
-E: Pair style MEAM requires newton pair on
-
-See the newton command.  This is a restriction to use the MEAM
-potential.
-
-E: Cannot open MEAM potential file %s
-
-The specified MEAM potential file cannot be opened.  Check that the
-path and name are correct.
-
-E: Incorrect format in MEAM potential file
-
-Incorrect number of words per line in the potential file.
-
-E: Unrecognized lattice type in MEAM file 1
-
-The lattice type in an entry of the MEAM library file is not
-valid.
-
-E: Did not find all elements in MEAM library file
-
-The requested elements were not found in the MEAM file.
-
-E: Keyword %s in MEAM parameter file not recognized
-
-Self-explanatory.
-
-E: Unrecognized lattice type in MEAM file 2
-
-The lattice type in an entry of the MEAM parameter file is not
-valid.
-
-*/
diff --git a/src/MEAM/meam.h b/src/MEAM/meam.h
index 8b94ac0084..8a2f43e354 100644
--- a/src/MEAM/meam.h
+++ b/src/MEAM/meam.h
@@ -27,7 +27,9 @@ typedef enum { FCC, BCC, HCP, DIM, DIA, DIA3, B1, C11, L12, B2, CH4, LIN, ZIG, T
 class MEAM {
  public:
   MEAM(Memory *mem);
-  ~MEAM();
+  virtual ~MEAM();
+
+  int copymode;
 
  protected:
   Memory *memory;
@@ -285,8 +287,8 @@ class MEAM {
                          double *rozero, int *ibar);
   void meam_setup_param(int which, double value, int nindex, int *index /*index(3)*/,
                         int *errorflag);
-  void meam_setup_done(double *cutmax);
-  void meam_dens_setup(int atom_nmax, int nall, int n_neigh);
+  virtual void meam_setup_done(double *cutmax);
+  virtual void meam_dens_setup(int atom_nmax, int nall, int n_neigh);
   void meam_dens_init(int i, int ntype, int *type, int *fmap, double **x, int numneigh,
                       int *firstneigh, int numneigh_full, int *firstneigh_full, int fnoffset);
   void meam_dens_final(int nlocal, int eflag_either, int eflag_global, int eflag_atom,
diff --git a/src/MEAM/meam_impl.cpp b/src/MEAM/meam_impl.cpp
index 499a2b1520..d0d81cee88 100644
--- a/src/MEAM/meam_impl.cpp
+++ b/src/MEAM/meam_impl.cpp
@@ -36,6 +36,7 @@ MEAM::MEAM(Memory* mem)
 
   maxneigh = 0;
   scrfcn = dscrfcn = fcpair = nullptr;
+  copymode = 0;
 
   neltypes = 0;
   for (int i = 0; i < maxelt; i++) {
@@ -53,6 +54,8 @@ MEAM::MEAM(Memory* mem)
 
 MEAM::~MEAM()
 {
+  if (copymode) return;
+
   memory->destroy(this->phirar6);
   memory->destroy(this->phirar5);
   memory->destroy(this->phirar4);
diff --git a/src/MEAM/pair_meam.cpp b/src/MEAM/pair_meam.cpp
index ab027dcb89..6cbe5c69b9 100644
--- a/src/MEAM/pair_meam.cpp
+++ b/src/MEAM/pair_meam.cpp
@@ -75,7 +75,8 @@ PairMEAM::~PairMEAM()
 {
   if (copymode) return;
 
-  delete meam_inst;
+  if (meam_inst)
+    delete meam_inst;
 
   if (allocated) {
     memory->destroy(setflag);
diff --git a/src/MEAM/pair_meam.h b/src/MEAM/pair_meam.h
index ffe8045ac9..304e7eb41f 100644
--- a/src/MEAM/pair_meam.h
+++ b/src/MEAM/pair_meam.h
@@ -52,7 +52,7 @@ class PairMEAM : public Pair {
 
   double **scale;    // scaling factor for adapt
 
-  virtual void allocate();
+  void allocate();
   void read_files(const std::string &, const std::string &);
   void read_global_meam_file(const std::string &);
   void read_user_meam_file(const std::string &);
diff --git a/src/pair.cpp b/src/pair.cpp
index 44f23745d9..5ce4dc2d32 100644
--- a/src/pair.cpp
+++ b/src/pair.cpp
@@ -126,6 +126,7 @@ Pair::Pair(LAMMPS *lmp) : Pointers(lmp)
   datamask_modify = ALL_MASK;
 
   kokkosable = 0;
+  reverse_comm_device = 0;
   copymode = 0;
 }
 
diff --git a/src/pair.h b/src/pair.h
index 4763a225d2..5bad9e14a3 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -123,6 +123,7 @@ class Pair : protected Pointers {
   ExecutionSpace execution_space;
   unsigned int datamask_read, datamask_modify;
   int kokkosable;    // 1 if Kokkos pair
+  int reverse_comm_device;    // 1 if reverse comm on Device
 
   Pair(class LAMMPS *);
   ~Pair() override;