Merge branch 'develop' into long-string-variables

2022-07-02 11:30:16 -04:00
parent 39b01a901f 8e4b3fd41b
commit b5d5654399
233 changed files with 41780 additions and 2976 deletions
--- a/src/.gitignore
+++ b/src/.gitignore
@ -173,12 +173,20 @@
 /pair_tdpd.cpp
 /pair_tdpd.h

+/compute_grid.cpp
+/compute_grid.h
+/compute_grid_local.cpp
+/compute_grid_local.h
 /compute_sna_atom.cpp
 /compute_sna_atom.h
 /compute_snad_atom.cpp
 /compute_snad_atom.h
 /compute_snav_atom.cpp
 /compute_snav_atom.h
+/compute_sna_grid.cpp
+/compute_sna_grid.h
+/compute_sna_grid_local.cpp
+/compute_sna_grid_local.h
 /compute_snap.cpp
 /compute_snap.h
 /openmp_snap.h
--- a/src/Depend.sh
+++ b/src/Depend.sh
@ -127,6 +127,10 @@ if (test $1 = "MANYBODY") then
  depend OPENMP
 fi

+if (test $1 = "MEAM") then
+  depend KOKKOS
+fi
+
 if (test $1 = "MOLECULE") then
  depend EXTRA-MOLECULE
  depend GPU
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@ -182,6 +182,13 @@ action kokkos_base.h
 action kokkos_base_fft.h fft3d.h
 action kokkos_few.h
 action kokkos_type.h
+action meam_kokkos.h meam.h
+action meam_dens_final_kokkos.h meam_dens_final.cpp
+action meam_dens_init_kokkos.h meam_dens_init.cpp
+action meam_force_kokkos.h meam_force.cpp
+action meam_funcs_kokkos.h meam_funcs.cpp
+action meam_impl_kokkos.h meam_impl.cpp
+action meam_setup_done_kokkos.h meam_setup_done.cpp
 action memory_kokkos.h
 action modify_kokkos.cpp
 action modify_kokkos.h
@ -287,6 +294,8 @@ action pair_lj_gromacs_kokkos.cpp pair_lj_gromacs.cpp
 action pair_lj_gromacs_kokkos.h pair_lj_gromacs.h
 action pair_lj_sdk_kokkos.cpp pair_lj_sdk.cpp
 action pair_lj_sdk_kokkos.h pair_lj_sdk.h
+action pair_meam_kokkos.cpp pair_meam.cpp
+action pair_meam_kokkos.h pair_meam.h
 action pair_morse_kokkos.cpp
 action pair_morse_kokkos.h
 action pair_multi_lucy_rx_kokkos.cpp pair_multi_lucy_rx.cpp
--- a/src/KOKKOS/atom_vec_angle_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_angle_kokkos.cpp
@ -1391,6 +1391,9 @@ int AtomVecAngleKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int n
                                              int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,
                                              ExecutionSpace space) {
  const size_t elements = 17+atom->maxspecial+2*atom->bond_per_atom+4*atom->angle_per_atom;
+
+  while (nlocal + nrecv/elements >= nmax) grow(0);
+
  if (space == Host) {
    k_count.h_view(0) = nlocal;
    AtomVecAngleKokkos_UnpackExchangeFunctor<LMPHostType>
--- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@ -649,6 +649,8 @@ struct AtomVecAtomicKokkos_UnpackExchangeFunctor {
 /* ---------------------------------------------------------------------- */

 int AtomVecAtomicKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,ExecutionSpace space) {
+  while (nlocal + nrecv/11 >= nmax) grow(0);
+
  if (space == Host) {
    k_count.h_view(0) = nlocal;
    AtomVecAtomicKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
--- a/src/KOKKOS/atom_vec_bond_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp
@ -845,6 +845,9 @@ int AtomVecBondKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nr
                                              int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,
                                              ExecutionSpace space) {
  const size_t elements = 16+atomKK->maxspecial+atomKK->bond_per_atom+atomKK->bond_per_atom;
+
+  while (nlocal + nrecv/elements >= nmax) grow(0);
+
  if (space == Host) {
    k_count.h_view(0) = nlocal;
    AtomVecBondKokkos_UnpackExchangeFunctor<LMPHostType>
--- a/src/KOKKOS/atom_vec_charge_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp
@ -774,6 +774,8 @@ struct AtomVecChargeKokkos_UnpackExchangeFunctor {
 int AtomVecChargeKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,
                                                int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,
                                                ExecutionSpace space) {
+  while (nlocal + nrecv/12 >= nmax) grow(0);
+
  if (space == Host) {
    k_count.h_view(0) = nlocal;
    AtomVecChargeKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
--- a/src/KOKKOS/atom_vec_dpd_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@ -1505,6 +1505,8 @@ struct AtomVecDPDKokkos_UnpackExchangeFunctor {
 /* ---------------------------------------------------------------------- */

 int AtomVecDPDKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,ExecutionSpace space) {
+  while (nlocal + nrecv/17 >= nmax) grow(0);
+
  if (space == Host) {
    k_count.h_view(0) = nlocal;
    AtomVecDPDKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
--- a/src/KOKKOS/atom_vec_full_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_full_kokkos.cpp
@ -1186,6 +1186,9 @@ int AtomVecFullKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nr
                                              ExecutionSpace space) {
  const size_t elements = 20+atom->maxspecial+2*atom->bond_per_atom+4*atom->angle_per_atom+
    5*atom->dihedral_per_atom + 5*atom->improper_per_atom;
+
+  while (nlocal + nrecv/elements >= nmax) grow(0);
+
  if (space == Host) {
    k_count.h_view(0) = nlocal;
    AtomVecFullKokkos_UnpackExchangeFunctor<LMPHostType>
--- a/src/KOKKOS/atom_vec_molecular_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
@ -1594,6 +1594,9 @@ int AtomVecMolecularKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,i
                                                   ExecutionSpace space) {
  const size_t elements = 19+atom->maxspecial+2*atom->bond_per_atom+4*atom->angle_per_atom+
    5*atom->dihedral_per_atom + 5*atom->improper_per_atom;
+
+  while (nlocal + nrecv/elements >= nmax) grow(0);
+
  if (space == Host) {
    k_count.h_view(0) = nlocal;
    AtomVecMolecularKokkos_UnpackExchangeFunctor<LMPHostType>
--- a/src/KOKKOS/atom_vec_sphere_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_sphere_kokkos.cpp
@ -2341,6 +2341,8 @@ struct AtomVecSphereKokkos_UnpackExchangeFunctor {
 /* ---------------------------------------------------------------------- */

 int AtomVecSphereKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,ExecutionSpace space) {
+  while (nlocal + nrecv/16 >= nmax) grow(0);
+
  if (space == Host) {
    k_count.h_view(0) = nlocal;
    AtomVecSphereKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
--- a/src/KOKKOS/atom_vec_spin_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_spin_kokkos.cpp
@ -863,6 +863,8 @@ struct AtomVecSpinKokkos_UnpackExchangeFunctor {
 int AtomVecSpinKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,
                                                int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,
                                                ExecutionSpace space) {
+  while (nlocal + nrecv/15 >= nmax) grow(0);
+
  if(space == Host) {
    k_count.h_view(0) = nlocal;
    AtomVecSpinKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@ -109,6 +109,7 @@ void CommKokkos::init()
  exchange_comm_classic = lmp->kokkos->exchange_comm_classic;
  forward_comm_classic = lmp->kokkos->forward_comm_classic;
  forward_pair_comm_classic = lmp->kokkos->forward_pair_comm_classic;
+  reverse_pair_comm_classic = lmp->kokkos->reverse_pair_comm_classic;
  forward_fix_comm_classic = lmp->kokkos->forward_fix_comm_classic;
  reverse_comm_classic = lmp->kokkos->reverse_comm_classic;
  exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host;
@ -478,12 +479,13 @@ void CommKokkos::forward_comm_device(Pair *pair)
  int nsize = pair->comm_forward;
  KokkosBase* pairKKBase = dynamic_cast<KokkosBase*>(pair);

+  int nmax = max_buf_pair;
  for (iswap = 0; iswap < nswap; iswap++) {
-    int n = MAX(max_buf_pair,nsize*sendnum[iswap]);
-    n = MAX(n,nsize*recvnum[iswap]);
-    if (n > max_buf_pair)
-      grow_buf_pair(n);
+    nmax = MAX(nmax,nsize*sendnum[iswap]);
+    nmax = MAX(nmax,nsize*recvnum[iswap]);
  }
+  if (nmax > max_buf_pair)
+    grow_buf_pair(nmax);

  for (iswap = 0; iswap < nswap; iswap++) {

@ -545,8 +547,76 @@ void CommKokkos::grow_buf_fix(int n) {

 void CommKokkos::reverse_comm(Pair *pair)
 {
-  k_sendlist.sync<LMPHostType>();
-  CommBrick::reverse_comm(pair);
+  if (pair->execution_space == Host || !pair->reverse_comm_device || reverse_pair_comm_classic) {
+    k_sendlist.sync<LMPHostType>();
+    CommBrick::reverse_comm(pair);
+  } else {
+    k_sendlist.sync<LMPDeviceType>();
+    reverse_comm_device<LMPDeviceType>(pair);
+  }
+}
+
+template<class DeviceType>
+void CommKokkos::reverse_comm_device(Pair *pair)
+{    // reverse communication for a Pair style: pack, exchange and unpack via the Kokkos pair buffers
+  int iswap,n;
+  MPI_Request request;
+  DAT::tdual_xfloat_1d k_buf_tmp;    // refers to whichever buffer the unpack step reads from
+  // NOTE(review): the result of this dynamic_cast is dereferenced below without a null check
+  KokkosBase* pairKKBase = dynamic_cast<KokkosBase*>(pair);
+  // per-item size: the larger of the two reverse sizes declared by the pair style
+  int nsize = MAX(pair->comm_reverse,pair->comm_reverse_off);
+  // size the pair buffers once, for the largest send or receive count of any swap
+  int nmax = max_buf_pair;
+  for (iswap = 0; iswap < nswap; iswap++) {
+    nmax = MAX(nmax,nsize*sendnum[iswap]);
+    nmax = MAX(nmax,nsize*recvnum[iswap]);
+  }
+  if (nmax > max_buf_pair)
+    grow_buf_pair(nmax);

  for (iswap = nswap-1; iswap >= 0; iswap--) {    // swaps are processed in reverse order

    // pack recvnum[iswap] entries starting at firstrecv[iswap] into the send buffer;
    // n is the count returned by the pack routine and is used as the MPI send length

    n = pairKKBase->pack_reverse_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send_pair);
    DeviceType().fence();    // wait for the pack kernel before the buffer is handed to MPI

    // exchange with another proc
    // if self, set recv buffer to send buffer
    // with GPU-aware MPI the device views are passed directly, otherwise the
    // send buffer is synced to the host and the host views are used

    double* buf_send_pair;
    double* buf_recv_pair;
    if (lmp->kokkos->gpu_aware_flag) {
      buf_send_pair = k_buf_send_pair.view<DeviceType>().data();
      buf_recv_pair = k_buf_recv_pair.view<DeviceType>().data();
    } else {
      k_buf_send_pair.modify<DeviceType>();
      k_buf_send_pair.sync<LMPHostType>();
      buf_send_pair = k_buf_send_pair.h_view.data();
      buf_recv_pair = k_buf_recv_pair.h_view.data();
    }

    if (sendproc[iswap] != me) {
      if (sendnum[iswap])
        MPI_Irecv(buf_recv_pair,nsize*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world,&request);
      if (recvnum[iswap])
        MPI_Send(buf_send_pair,n,MPI_DOUBLE,recvproc[iswap],0,world);
      if (sendnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);

      if (!lmp->kokkos->gpu_aware_flag) {    // data arrived in the host view: push it to the device
        k_buf_recv_pair.modify<LMPHostType>();
        k_buf_recv_pair.sync<DeviceType>();
      }
      k_buf_tmp = k_buf_recv_pair;
    } else k_buf_tmp = k_buf_send_pair;

    // unpack sendnum[iswap] entries using the send list of this swap

    pairKKBase->unpack_reverse_comm_kokkos(sendnum[iswap],k_sendlist,
                                       iswap,k_buf_tmp);
    DeviceType().fence();    // wait for the unpack kernel before the buffers are reused
  }
 }

 void CommKokkos::forward_comm(Dump *dump)
--- a/src/KOKKOS/comm_kokkos.h
+++ b/src/KOKKOS/comm_kokkos.h
@ -27,6 +27,7 @@ class CommKokkos : public CommBrick {
  bool exchange_comm_classic;
  bool forward_comm_classic;
  bool forward_pair_comm_classic;
+  bool reverse_pair_comm_classic;
  bool forward_fix_comm_classic;
  bool reverse_comm_classic;
  bool exchange_comm_on_host;
@ -58,6 +59,7 @@ class CommKokkos : public CommBrick {
  template<class DeviceType> void forward_comm_device(int dummy);
  template<class DeviceType> void reverse_comm_device();
  template<class DeviceType> void forward_comm_device(Pair *pair);
+  template<class DeviceType> void reverse_comm_device(Pair *pair);
  template<class DeviceType> void forward_comm_device(Fix *fix, int size=0);
  template<class DeviceType> void exchange_device();
  template<class DeviceType> void borders_device();
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@ -91,6 +91,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
  exchange_comm_changed = 0;
  forward_comm_changed = 0;
  forward_pair_comm_changed = 0;
+  reverse_pair_comm_changed = 0;
  forward_fix_comm_changed = 0;
  reverse_comm_changed = 0;

@ -239,7 +240,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
    newtonflag = 0;

    exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
-    forward_pair_comm_classic = forward_fix_comm_classic = 0;
+    forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 0;

    exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
  } else {
@ -253,7 +254,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
    newtonflag = 1;

    exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
-    forward_pair_comm_classic = forward_fix_comm_classic = 1;
+    forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 1;

    exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
  }
@ -394,17 +395,17 @@ void KokkosLMP::accelerator(int narg, char **arg)
      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
      if (strcmp(arg[iarg+1],"no") == 0) {
        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
-        forward_pair_comm_classic = forward_fix_comm_classic = 1;
+        forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 1;

        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
      } else if (strcmp(arg[iarg+1],"host") == 0) {
        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
-        forward_pair_comm_classic = forward_fix_comm_classic = 1;
+        forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 1;

        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1;
      } else if (strcmp(arg[iarg+1],"device") == 0) {
        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
-        forward_pair_comm_classic = forward_fix_comm_classic = 0;
+        forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 0;

        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
      } else error->all(FLERR,"Illegal package kokkos command");
@ -441,6 +442,14 @@ void KokkosLMP::accelerator(int narg, char **arg)
      else error->all(FLERR,"Illegal package kokkos command");
      forward_pair_comm_changed = 0;
      iarg += 2;
+    } else if (strcmp(arg[iarg],"comm/pair/reverse") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"no") == 0) reverse_pair_comm_classic = 1;
+      else if (strcmp(arg[iarg+1],"host") == 0) reverse_pair_comm_classic = 1;
+      else if (strcmp(arg[iarg+1],"device") == 0) reverse_pair_comm_classic = 0;
+      else error->all(FLERR,"Illegal package kokkos command");
+      reverse_pair_comm_changed = 0;
+      iarg += 2;
    } else if (strcmp(arg[iarg],"comm/fix/forward") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
      if (strcmp(arg[iarg+1],"no") == 0) forward_fix_comm_classic = 1;
@ -515,6 +524,10 @@ void KokkosLMP::accelerator(int narg, char **arg)
      forward_pair_comm_classic = 1;
      forward_pair_comm_changed = 1;
    }
+    if (reverse_pair_comm_classic == 0) {
+      reverse_pair_comm_classic = 1;
+      reverse_pair_comm_changed = 1;
+    }
    if (forward_fix_comm_classic == 0) {
      forward_fix_comm_classic = 1;
      forward_fix_comm_changed = 1;
@ -540,6 +553,10 @@ void KokkosLMP::accelerator(int narg, char **arg)
      forward_pair_comm_classic = 0;
      forward_pair_comm_changed = 0;
    }
+    if (reverse_pair_comm_changed) {
+      reverse_pair_comm_classic = 0;
+      reverse_pair_comm_changed = 0;
+    }
    if (forward_fix_comm_changed) {
      forward_fix_comm_classic = 0;
      forward_fix_comm_changed = 0;
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@ -30,6 +30,7 @@ class KokkosLMP : protected Pointers {
  int exchange_comm_classic;
  int forward_comm_classic;
  int forward_pair_comm_classic;
+  int reverse_pair_comm_classic;
  int forward_fix_comm_classic;
  int reverse_comm_classic;
  int exchange_comm_on_host;
@ -38,6 +39,7 @@ class KokkosLMP : protected Pointers {
  int exchange_comm_changed;
  int forward_comm_changed;
  int forward_pair_comm_changed;
+  int reverse_pair_comm_changed;
  int forward_fix_comm_changed;
  int reverse_comm_changed;
  int nthreads,ngpus;
--- a/src/KOKKOS/kokkos_base.h
+++ b/src/KOKKOS/kokkos_base.h
@ -29,6 +29,10 @@ class KokkosBase {
                                       int, int *) {return 0;};
  virtual void unpack_forward_comm_kokkos(int, int, DAT::tdual_xfloat_1d &) {}

+  virtual int pack_reverse_comm_kokkos(int, int, DAT::tdual_xfloat_1d &) {return 0;};
+  virtual void unpack_reverse_comm_kokkos(int, DAT::tdual_int_2d,
+                                          int, DAT::tdual_xfloat_1d &) {}
+
  // Fix
  virtual int pack_forward_comm_fix_kokkos(int, DAT::tdual_int_2d,
                                           int, DAT::tdual_xfloat_1d &,
--- a/src/KOKKOS/math_special_kokkos.cpp
+++ b/src/KOKKOS/math_special_kokkos.cpp
@ -477,59 +477,3 @@ double MathSpecialKokkos::erfcx_y100(const double y100)
    return 1.0;
 } /* erfcx_y100 */

-/* optimizer friendly implementation of exp2(x).
- *
- * strategy:
- *
- * split argument into an integer part and a fraction:
- * ipart = floor(x+0.5);
- * fpart = x - ipart;
- *
- * compute exp2(ipart) from setting the ieee754 exponent
- * compute exp2(fpart) using a pade' approximation for x in [-0.5;0.5[
- *
- * the result becomes: exp2(x) = exp2(ipart) * exp2(fpart)
- */
-
-/* IEEE 754 double precision floating point data manipulation */
-typedef union
-{
-    double   f;
-    uint64_t u;
-    struct {int32_t  i0,i1;} s;
-}  udi_t;
-
-static const double fm_exp2_q[] = {
-/*  1.00000000000000000000e0, */
-    2.33184211722314911771e2,
-    4.36821166879210612817e3
-};
-static const double fm_exp2_p[] = {
-    2.30933477057345225087e-2,
-    2.02020656693165307700e1,
-    1.51390680115615096133e3
-};
-
-double MathSpecialKokkos::exp2_x86(double x)
-{
-    double   ipart, fpart, px, qx;
-    udi_t    epart;
-
-    ipart = floor(x+0.5);
-    fpart = x - ipart;
-    epart.s.i0 = 0;
-    epart.s.i1 = (((int) ipart) + 1023) << 20;
-
-    x = fpart*fpart;
-
-    px =        fm_exp2_p[0];
-    px = px*x + fm_exp2_p[1];
-    qx =    x + fm_exp2_q[0];
-    px = px*x + fm_exp2_p[2];
-    qx = qx*x + fm_exp2_q[1];
-
-    px = px * fpart;
-
-    x = 1.0 + 2.0*(px/(qx-px));
-    return epart.f*x;
-}
--- a/src/KOKKOS/math_special_kokkos.h
+++ b/src/KOKKOS/math_special_kokkos.h
@ -22,79 +22,233 @@ namespace LAMMPS_NS {

 namespace MathSpecialKokkos {

+  /*! Fast tabulated factorial function
+   *
+   *  This function looks up pre-computed factorial values for arguments of n = 0
+   *  to a maximum of 167, which is the maximal value representable by a double
+   *  precision floating point number.  For other values of n a NaN value is returned.
+   *
+   *  \param   n  argument (valid: 0 <= n <= 167)
+   *  \return  value of n! as double precision number or NaN */
+
+  extern double factorial(const int n);
+
+  /* optimizer friendly implementation of exp2(x).
+   *
+   * strategy:
+   *
+   * split argument into an integer part and a fraction:
+   * ipart = floor(x+0.5);
+   * fpart = x - ipart;
+   *
+   * compute exp2(ipart) from setting the ieee754 exponent
+   * compute exp2(fpart) using a pade' approximation for x in [-0.5;0.5[
+   *
+   * the result becomes: exp2(x) = exp2(ipart) * exp2(fpart)
+   */
+
+  /* IEEE 754 double precision floating point data manipulation */
+  typedef union
+  {
+    double   f;
+    uint64_t u;
+    struct {int32_t  i0,i1;} s;
+  }  udi_t;
+
+  /* double precision constants */
+  #define FM_DOUBLE_LOG2OFE  1.4426950408889634074
+
+  /*! Fast implementation of 2^x without argument checks for little endian CPUs
+   *
+   *  This function implements an optimized version of pow(2.0, x) that does not
+   *  check for valid arguments and thus may only be used where arguments are well
+   *  behaved.  The implementation makes assumptions about the layout of double
+   *  precision floating point numbers in memory and thus will only work on little
+   *  endian CPUs.  If little endian cannot be safely detected, the result of
+   *  calling pow(2.0, x) will be returned.  This function also is the basis for
+   *  the fast exponential fm_exp(x).
+   *
+   *  \param   x argument
+   *  \return  value of 2^x as double precision number */
+
+  KOKKOS_INLINE_FUNCTION
+  static double exp2_x86(double x)
+  {    // computes 2^x as 2^ipart * 2^fpart; no argument range checks are performed
+  #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+      double   ipart, fpart, px, qx;
+      udi_t    epart;
+  // coefficients of the approximation used for the fractional part
+  const double fm_exp2_q[2] = {
+  /*  1.00000000000000000000e0, */
+      2.33184211722314911771e2,
+      4.36821166879210612817e3
+  };
+  const double fm_exp2_p[3] = {
+      2.30933477057345225087e-2,
+      2.02020656693165307700e1,
+      1.51390680115615096133e3
+  };
+
+      ipart = floor(x+0.5);    // integer part: x rounded to the nearest integer
+      fpart = x - ipart;       // fractional remainder
+      epart.s.i0 = 0;
+      epart.s.i1 = (((int) ipart) + 1023) << 20;    // biased exponent (ipart + 1023) shifted to bit 20 of the i1 word
+      // evaluate the px and qx polynomials in fpart*fpart
+      x = fpart*fpart;
+
+      px =        fm_exp2_p[0];
+      px = px*x + fm_exp2_p[1];
+      qx =    x + fm_exp2_q[0];
+      px = px*x + fm_exp2_p[2];
+      qx = qx*x + fm_exp2_q[1];
+
+      px = px * fpart;
+
+      x = 1.0 + 2.0*(px/(qx-px));    // approximation of 2^fpart
+      return epart.f*x;              // 2^ipart * 2^fpart
+  #else
+      return pow(2.0, x);    // fallback when little endian byte order is not detected
+  #endif
+  }
+
+  /*! Fast implementation of exp(x) for little endian CPUs
+   *
+   *  This function implements an optimized version of exp(x) for little endian CPUs.
+   *  It calls the exp2_x86(x) function with a suitable prefactor to x to return exp(x).
+   *  The implementation makes assumptions about the layout of double
+   *  precision floating point numbers in memory and thus will only work on little
+   *  endian CPUs.  If little endian cannot be safely detected, the result of
+   *  calling the exp(x) implementation in the standard math library will be returned.
+   *
+   *  \param   x argument
+   *  \return  value of e^x as double precision number */
+
+  KOKKOS_INLINE_FUNCTION
+  static double fm_exp(double x)
+  {    // exp(x) evaluated as 2^(x*log2(e)) through exp2_x86(), with explicit range guards
+  #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+      if (x < -1022.0/FM_DOUBLE_LOG2OFE) return 0;           // x*log2(e) < -1022: return zero
+      if (x > 1023.0/FM_DOUBLE_LOG2OFE) return INFINITY;     // x*log2(e) > 1023: return infinity
+      return exp2_x86(FM_DOUBLE_LOG2OFE * x);                // rescale the argument from base e to base 2
+  #else
+      return ::exp(x);    // fallback when little endian byte order is not detected
+  #endif
+  }
+
  // support function for scaled error function complement

  extern double erfcx_y100(const double y100);

-  // fast 2**x function without argument checks for little endian CPUs
-  extern double exp2_x86(double x);
-
-  // scaled error function complement exp(x*x)*erfc(x) for coul/long styles
+  /*! Fast scaled error function complement exp(x*x)*erfc(x) for coul/long styles
+   *
+   *  This is a portable fast implementation of exp(x*x)*erfc(x) that can be used
+   *  in coul/long pair styles as a replacement for the polynomial expansion that
+   *  is/was widely used.  Unlike the polynomial expansion, that is only accurate
+   *  at the level of single precision floating point it provides full double precision
+   *  accuracy, but at comparable speed (unlike the erfc() implementation shipped
+   *  with GNU standard math library).
+   *
+   *  \param   x argument
+   *  \return  value of e^(x*x)*erfc(x) */

  static inline double my_erfcx(const double x)
  {
-    if (x >= 0.0) return erfcx_y100(400.0/(4.0+x));
-    else return 2.0*exp(x*x) - erfcx_y100(400.0/(4.0-x));
+    if (x >= 0.0)
+      return erfcx_y100(400.0 / (4.0 + x));
+    else
+      return 2.0 * exp(x * x) - erfcx_y100(400.0 / (4.0 - x));
  }

-  // exp(-x*x) for coul/long styles
+  /*! Fast implementation of exp(-x*x) for little endian CPUs for coul/long styles
+   *
+   *  This function implements an optimized version of exp(-x*x) based on exp2_x86()
+   *  for use with little endian CPUs. If little endian cannot be safely detected,
+   *  the result of calling the exp(-x*x) implementation in the standard math
+   *  library will be returned.
+   *
+   *  \param   x argument
+   *  \return  value of e^(-x*x) as double precision number */

  static inline double expmsq(double x)
  {
    x *= x;
    x *= 1.4426950408889634074; // log_2(e)
-#if defined(__BYTE_ORDER__) &&  __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    return (x < 1023.0) ? exp2_x86(-x) : 0.0;
 #else
    return (x < 1023.0) ? exp2(-x) : 0.0;
 #endif
  }

-  // x**2, use instead of pow(x,2.0)
-  KOKKOS_INLINE_FUNCTION
-  static double square(const double &x) { return x*x; }
+  /*! Fast inline version of pow(x, 2.0)
+   *
+   *  \param   x argument
+   *  \return  x*x */

-  // x**3, use instead of pow(x,3.0)
  KOKKOS_INLINE_FUNCTION
-  static double cube(const double &x) { return x*x*x; }
+  static double square(const double &x) { return x * x; }
+
+  /*! Fast inline version of pow(x, 3.0)
+   *
+   *  \param   x argument
+   *  \return  x*x*x */
+
+  KOKKOS_INLINE_FUNCTION
+  static double cube(const double &x) { return x * x * x; }
+
+  /*! Fast inline version of pow(-1.0, n)
+   *
+   *  \param   n argument (integer)
+   *  \return  -1.0 if n is odd, 1.0 if n is even */

-  // return -1.0 for odd n, 1.0 for even n, like pow(-1.0,n)
  KOKKOS_INLINE_FUNCTION
  static double powsign(const int n) { return (n & 1) ? -1.0 : 1.0; }

-  // optimized version of pow(x,n) with n being integer
-  // up to 10x faster than pow(x,y)
+  /*! Fast inline version of pow(x,n) for integer n
+   *
+   * This is a version of pow(x,n) optimized for n being integer.
+   * Speedups of up to 10x faster than pow(x,y) have been measured.
+   *
+   *  \param   n argument (integer)
+   *  \return  value of x^n */

  KOKKOS_INLINE_FUNCTION
-  static double powint(const double &x, const int n) {
-    double yy,ww;
+  static double powint(const double &x, const int n)
+  {
+    double yy, ww;

    if (x == 0.0) return 0.0;
    int nn = (n > 0) ? n : -n;
    ww = x;

-    for (yy = 1.0; nn != 0; nn >>= 1, ww *=ww)
+    for (yy = 1.0; nn != 0; nn >>= 1, ww *= ww)
      if (nn & 1) yy *= ww;

-    return (n > 0) ? yy : 1.0/yy;
+    return (n > 0) ? yy : 1.0 / yy;
  }

-  // optimized version of (sin(x)/x)**n with n being a _positive_ integer
+  /*! Fast inline version of (sin(x)/x)^n as used by PPPM kspace styles
+   *
+   * This is an optimized function to compute (sin(x)/x)^n as frequently used by PPPM.
+   *
+   *  \param   n argument (integer). Expected to be positive.
+   *  \return  value of (sin(x)/x)^n */

  KOKKOS_INLINE_FUNCTION
-  static double powsinxx(const double &x, int n) {
-    double yy,ww;
+  static double powsinxx(const double &x, int n)
+  {
+    double yy, ww;

    if (x == 0.0) return 1.0;

-    ww = sin(x)/x;
+    ww = sin(x) / x;

-    for (yy = 1.0; n != 0; n >>= 1, ww *=ww)
+    for (yy = 1.0; n != 0; n >>= 1, ww *= ww)
      if (n & 1) yy *= ww;

    return yy;
  }
-}
-}
+}    // namespace MathSpecialKokkos
+}    // namespace LAMMPS_NS

 #endif
--- a/src/KOKKOS/meam_dens_final_kokkos.h
+++ b/src/KOKKOS/meam_dens_final_kokkos.h
@ -0,0 +1,164 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "meam_kokkos.h"
+#include "math_special.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void
+MEAMKokkos<DeviceType>::meam_dens_final(int nlocal, int eflag_either, int eflag_global, int eflag_atom,
+                      typename ArrayTypes<DeviceType>::t_efloat_1d eatom, int ntype, typename AT::t_int_1d type, typename AT::t_int_1d d_map, typename AT::t_int_2d d_scale, int& errorflag, EV_FLOAT &ev_all)
+{    // stores the arguments in members, runs the TagMEAMDensFinal kernel over [0,nlocal), returns energy and error flag
+  EV_FLOAT ev;    // receives the reduction result of the kernel
+  this->eflag_either = eflag_either;
+  this->eflag_global = eflag_global;
+  this->eflag_atom = eflag_atom;
+  this->d_eatom = eatom;
+  this->ntype = ntype;
+  this->type = type;
+  this->d_map = d_map;
+  this->d_scale = d_scale;    // NOTE(review): typed t_int_2d here but read into a double (scaleii) in the kernel - confirm element type
+  // reset d_errorflag to zero before the kernel runs
+  Kokkos::deep_copy(d_errorflag,0);
+
+  // Complete the calculation of density
+  // NOTE(review): copymode is set around the launch, presumably because *this is copied into the kernel - confirm
+  copymode = 1;
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMDensFinal>(0,nlocal),*this,ev);
+  ev_all.evdwl += ev.evdwl;    // only the evdwl component is accumulated into the caller's total
+  copymode = 0;
+  // copy the error flag to the host and hand it back through the errorflag argument
+  auto h_errorflag = Kokkos::create_mirror_view_and_copy(LMPHostType(),d_errorflag);
+  errorflag = h_errorflag();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void MEAMKokkos<DeviceType>::operator()(TagMEAMDensFinal, const int &i, EV_FLOAT& ev) const {
+  // kernel body for one atom i: finishes rho1/rho2/rho3, gamma, rho and dgamma1-3, then tallies the embedding() result
+  F_FLOAT rhob, G, dG, Gbar, dGbar, gam, shp[3], Z;
+  F_FLOAT denom, rho_bkgd, Fl;
+  double scaleii;
+  // atoms whose type maps to a negative element index are skipped entirely
+  int elti = d_map[type[i]];
+  if (elti >= 0) {
+    scaleii = d_scale(type[i],type[i]);
+    d_rho1[i] = 0.0;
+    d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i];
+    d_rho3[i] = 0.0;
+    for (int m = 0; m < 3; m++) {
+      d_rho1[i] += d_arho1(i,m) * d_arho1(i,m);
+      d_rho3[i] -= 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m);
+    }
+    for (int m = 0; m < 6; m++)
+      d_rho2[i] += v2D[m] * d_arho2(i,m) * d_arho2(i,m);
+    for (int m = 0; m < 10; m++)
+      d_rho3[i] += v3D[m] * d_arho3(i,m) * d_arho3(i,m);
+    // finalize t_ave according to ialloy; left untouched when rho0 is not positive
+    if (d_rho0[i] > 0.0) {
+      if (ialloy == 1) {
+        d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0));
+        d_t_ave(i,1) = fdiv_zero_kk(d_t_ave(i,1), d_tsq_ave(i,1));
+        d_t_ave(i,2) = fdiv_zero_kk(d_t_ave(i,2), d_tsq_ave(i,2));
+      } else if (ialloy == 2) {
+        d_t_ave(i,0) = t1_meam[elti];
+        d_t_ave(i,1) = t2_meam[elti];
+        d_t_ave(i,2) = t3_meam[elti];
+      } else {
+        d_t_ave(i,0) /= d_rho0[i];
+        d_t_ave(i,1) /= d_rho0[i];
+        d_t_ave(i,2) /= d_rho0[i];
+      }
+    }
+
+    d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i];
+
+    if (d_rho0[i] > 0.0)
+      d_gamma[i] /= (d_rho0[i] * d_rho0[i]);
+
+    Z = get_Zij(lattce_meam[elti][elti]);
+    // stop processing this atom if the error flag is nonzero after G_gam
+    G = G_gam(d_gamma[i], ibar_meam[elti], d_errorflag());
+    if (d_errorflag() != 0)
+      return;
+
+    get_shpfcn(lattce_meam[elti][elti], stheta_meam[elti][elti], ctheta_meam[elti][elti], shp);
+    if (ibar_meam[elti] <= 0) {
+      Gbar = 1.0;
+      dGbar = 0.0;
+    } else {
+      if (mix_ref_t == 1)
+        gam = (d_t_ave(i,0) * shp[0] + d_t_ave(i,1) * shp[1] + d_t_ave(i,2) * shp[2]) / (Z * Z);
+      else
+        gam = (t1_meam[elti] * shp[0] + t2_meam[elti] * shp[1] + t3_meam[elti] * shp[2]) /
+              (Z * Z);
+      Gbar = G_gam(gam, ibar_meam[elti], d_errorflag());    // NOTE(review): error flag is not checked after this call, unlike above
+    }
+    d_rho[i] = d_rho0[i] * G;
+    // choose the background density rho_bkgd that rho[i] is divided by below
+    if (mix_ref_t == 1) {
+      if (ibar_meam[elti] <= 0) {
+        Gbar = 1.0;
+        dGbar = 0.0;
+      } else {
+        gam = (d_t_ave(i,0) * shp[0] + d_t_ave(i,1) * shp[1] + d_t_ave(i,2) * shp[2]) / (Z * Z);
+        Gbar = dG_gam(gam, ibar_meam[elti], dGbar);
+      }
+      rho_bkgd = rho0_meam[elti] * Z * Gbar;
+    } else {
+      if (bkgd_dyn == 1)
+        rho_bkgd = rho0_meam[elti] * Z;
+      else
+        rho_bkgd = rho_ref_meam[elti];
+    }
+    rhob = d_rho[i] / rho_bkgd;
+    denom = 1.0 / rho_bkgd;
+
+    G = dG_gam(d_gamma[i], ibar_meam[elti], dG);    // also fills dG, which is used for dgamma1 and dgamma2
+
+    d_dgamma1[i] = (G - 2 * dG * d_gamma[i]) * denom;
+
+    if (!iszero_kk(d_rho0[i]))
+      d_dgamma2[i] = (dG / d_rho0[i]) * denom;
+    else
+      d_dgamma2[i] = 0.0;
+
+    // dgamma3 is nonzero only if we are using the "mixed" rule for
+    // computing t in the reference system (which is not correct, but
+    // included for backward compatibility)
+    if (mix_ref_t == 1)
+      d_dgamma3[i] = d_rho0[i] * G * dGbar / (Gbar * Z * Z) * denom;
+    else
+      d_dgamma3[i] = 0.0;
+
+    Fl = embedding(A_meam[elti], Ec_meam[elti][elti], rhob, d_frhop[i]);
+    // Fl is scaled by scaleii and tallied only when an energy flag is set
+    if (eflag_either) {
+      Fl *= scaleii;
+      if (eflag_global) {
+        ev.evdwl += Fl;
+      }
+      if (eflag_atom) {
+        d_eatom[i] += Fl;
+      }
+    }
+  }
+}
+
--- a/src/KOKKOS/meam_dens_init_kokkos.h
+++ b/src/KOKKOS/meam_dens_init_kokkos.h
@ -0,0 +1,602 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "meam_kokkos.h"
+#include "math_special_kokkos.h"
+
+using namespace LAMMPS_NS;
+using namespace MathSpecialKokkos;
+
+/* ---------------------------------------------------------------------- */
+
+// Functor body for the TagMEAMDensInit range. Entry i of the half list is
+// translated into an atom index and a start offset into the flat per-pair
+// arrays; the screening pass and the partial-density pass are then run for
+// that atom, in that order.
+template<class DeviceType>
+template<int NEIGHFLAG>
+KOKKOS_INLINE_FUNCTION
+void MEAMKokkos<DeviceType>::operator()(TagMEAMDensInit<NEIGHFLAG>, const int &i) const {
+  const int atom = d_ilist_half[i];     // atom handled by this work item
+  const int pair_start = d_offset[i];   // first per-pair slot belonging to this entry
+
+  // pass 1: screening function, its derivative term, and the cutoff value
+  this->template getscreen<NEIGHFLAG>(atom, pair_start, x, d_numneigh_half,
+                                      d_numneigh_full, ntype, type, d_map);
+
+  // pass 2: intermediate density terms built from the values stored by pass 1
+  this->template calc_rho1<NEIGHFLAG>(atom, ntype, type, d_map, x, d_numneigh_half, pair_start);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Functor body for the TagMEAMZero range: resets every per-atom density
+// accumulator of atom i to zero.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void MEAMKokkos<DeviceType>::operator()(TagMEAMZero, const int &i) const {
+  // scalar accumulators
+  d_rho0[i] = 0.0;
+  d_arho2b[i] = 0.0;
+
+  // three-component accumulators
+  for (int m = 0; m < 3; m++) {
+    d_arho1(i,m) = 0.0;
+    d_arho3b(i,m) = 0.0;
+    d_t_ave(i,m) = 0.0;
+    d_tsq_ave(i,m) = 0.0;
+  }
+
+  // six- and ten-component accumulators
+  for (int m = 0; m < 6; m++) d_arho2(i,m) = 0.0;
+  for (int m = 0; m < 10; m++) d_arho3(i,m) = 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side preparation for a density pass.
+//   atom_nmax : requested per-atom capacity; when it exceeds nmax every
+//               per-atom view is recreated with the new length
+//   nall      : the accumulators of atoms [0, nall) are zeroed at the end
+//   n_neigh   : requested per-pair capacity; when it exceeds maxneigh the
+//               three per-pair views are recreated with the new length
+// Recreated views are newly constructed, so their old contents are not kept.
+template<class DeviceType>
+void
+MEAMKokkos<DeviceType>::meam_dens_setup(int atom_nmax, int nall, int n_neigh)
+{
+  MemoryKokkos *memoryKK = (MemoryKokkos *)memory;
+
+  // per-atom storage: rebuild all of it when the atom capacity has grown
+
+  if (atom_nmax > nmax) {
+    nmax = atom_nmax;
+
+    // one value per atom
+
+    memoryKK->destroy_kokkos(k_rho,rho);
+    k_rho = DAT::tdual_ffloat_1d("pair:rho",nmax);
+    d_rho = k_rho.template view<DeviceType>();
+    h_rho = k_rho.h_view;
+
+    memoryKK->destroy_kokkos(k_rho0,rho0);
+    k_rho0 = DAT::tdual_ffloat_1d("pair:rho0",nmax);
+    d_rho0 = k_rho0.template view<DeviceType>();
+    h_rho0 = k_rho0.h_view;
+
+    memoryKK->destroy_kokkos(k_rho1,rho1);
+    k_rho1 = DAT::tdual_ffloat_1d("pair:rho1",nmax);
+    d_rho1 = k_rho1.template view<DeviceType>();
+    h_rho1 = k_rho1.h_view;
+
+    memoryKK->destroy_kokkos(k_rho2,rho2);
+    k_rho2 = DAT::tdual_ffloat_1d("pair:rho2",nmax);
+    d_rho2 = k_rho2.template view<DeviceType>();
+    h_rho2 = k_rho2.h_view;
+
+    memoryKK->destroy_kokkos(k_rho3,rho3);
+    k_rho3 = DAT::tdual_ffloat_1d("pair:rho3",nmax);
+    d_rho3 = k_rho3.template view<DeviceType>();
+    h_rho3 = k_rho3.h_view;
+
+    memoryKK->destroy_kokkos(k_frhop,frhop);
+    k_frhop = DAT::tdual_ffloat_1d("pair:frhop",nmax);
+    d_frhop = k_frhop.template view<DeviceType>();
+    h_frhop = k_frhop.h_view;
+
+    memoryKK->destroy_kokkos(k_gamma,gamma);
+    k_gamma = DAT::tdual_ffloat_1d("pair:gamma",nmax);
+    d_gamma = k_gamma.template view<DeviceType>();
+    h_gamma = k_gamma.h_view;
+
+    memoryKK->destroy_kokkos(k_dgamma1,dgamma1);
+    k_dgamma1 = DAT::tdual_ffloat_1d("pair:dgamma1",nmax);
+    d_dgamma1 = k_dgamma1.template view<DeviceType>();
+    h_dgamma1 = k_dgamma1.h_view;
+
+    memoryKK->destroy_kokkos(k_dgamma2,dgamma2);
+    k_dgamma2 = DAT::tdual_ffloat_1d("pair:dgamma2",nmax);
+    d_dgamma2 = k_dgamma2.template view<DeviceType>();
+    h_dgamma2 = k_dgamma2.h_view;
+
+    memoryKK->destroy_kokkos(k_dgamma3,dgamma3);
+    k_dgamma3 = DAT::tdual_ffloat_1d("pair:dgamma3",nmax);
+    d_dgamma3 = k_dgamma3.template view<DeviceType>();
+    h_dgamma3 = k_dgamma3.h_view;
+
+    memoryKK->destroy_kokkos(k_arho2b,arho2b);
+    k_arho2b = DAT::tdual_ffloat_1d("pair:arho2b",nmax);
+    d_arho2b = k_arho2b.template view<DeviceType>();
+    h_arho2b = k_arho2b.h_view;
+
+    // several values per atom (3, 6, 10, 3, 3, 3 columns respectively)
+
+    memoryKK->destroy_kokkos(k_arho1,arho1);
+    k_arho1 = DAT::tdual_ffloat_2d("pair:arho1",nmax, 3);
+    d_arho1 = k_arho1.template view<DeviceType>();
+    h_arho1 = k_arho1.h_view;
+
+    memoryKK->destroy_kokkos(k_arho2,arho2);
+    k_arho2 = DAT::tdual_ffloat_2d("pair:arho2",nmax, 6);
+    d_arho2 = k_arho2.template view<DeviceType>();
+    h_arho2 = k_arho2.h_view;
+
+    memoryKK->destroy_kokkos(k_arho3,arho3);
+    k_arho3 = DAT::tdual_ffloat_2d("pair:arho3",nmax, 10);
+    d_arho3 = k_arho3.template view<DeviceType>();
+    h_arho3 = k_arho3.h_view;
+
+    memoryKK->destroy_kokkos(k_arho3b,arho3b);
+    k_arho3b = DAT::tdual_ffloat_2d("pair:arho3b",nmax, 3);
+    d_arho3b = k_arho3b.template view<DeviceType>();
+    h_arho3b = k_arho3b.h_view;
+
+    memoryKK->destroy_kokkos(k_t_ave,t_ave);
+    k_t_ave = DAT::tdual_ffloat_2d("pair:t_ave",nmax, 3);
+    d_t_ave = k_t_ave.template view<DeviceType>();
+    h_t_ave = k_t_ave.h_view;
+
+    memoryKK->destroy_kokkos(k_tsq_ave,tsq_ave);
+    k_tsq_ave = DAT::tdual_ffloat_2d("pair:tsq_ave",nmax, 3);
+    d_tsq_ave = k_tsq_ave.template view<DeviceType>();
+    h_tsq_ave = k_tsq_ave.h_view;
+  }
+
+  // per-pair storage: rebuild when the neighbor capacity has grown
+
+  if (n_neigh > maxneigh) {
+    maxneigh = n_neigh;
+
+    memoryKK->destroy_kokkos(k_scrfcn,scrfcn);
+    k_scrfcn = DAT::tdual_ffloat_1d("pair:scrfcn", maxneigh);
+    d_scrfcn = k_scrfcn.template view<DeviceType>();
+    h_scrfcn = k_scrfcn.h_view;
+
+    memoryKK->destroy_kokkos(k_dscrfcn,dscrfcn);
+    k_dscrfcn = DAT::tdual_ffloat_1d("pair:dscrfcn", maxneigh);
+    d_dscrfcn = k_dscrfcn.template view<DeviceType>();
+    h_dscrfcn = k_dscrfcn.h_view;
+
+    memoryKK->destroy_kokkos(k_fcpair,fcpair);
+    k_fcpair = DAT::tdual_ffloat_1d("pair:fcpair", maxneigh);
+    d_fcpair = k_fcpair.template view<DeviceType>();
+    h_fcpair = k_fcpair.h_view;
+  }
+
+  // clear the per-atom accumulators of atoms [0, nall); copymode is set only
+  // for the duration of the launch, which receives *this as the functor
+
+  using ZeroPolicy = Kokkos::RangePolicy<DeviceType, TagMEAMZero>;
+  copymode = 1;
+  Kokkos::parallel_for(ZeroPolicy(0, nall), *this);
+  copymode = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Host-side driver for the first density pass.
+// Stores the caller's views and counts on the object so the device functor
+// can read them, creates the scatter views used for accumulation, launches
+// the TagMEAMDensInit kernel over entries [0, inum_half) of the half list and,
+// when duplication is in use, folds the duplicated sums back into the
+// destination views and releases the duplicates.
+//   neighflag : HALF or HALFTHREAD selects the kernel instantiation; any
+//               other value launches nothing
+//   need_dup  : nonzero -> duplicated scatter views, zero -> non-duplicated.
+//               NOTE(review): calc_rho1 chooses between dup_* and ndup_* via
+//               NeedDup_v<NEIGHFLAG,DeviceType>, so need_dup is expected to
+//               agree with that trait - confirm at the call site.
+template<class DeviceType>
+void
+MEAMKokkos<DeviceType>::meam_dens_init(int inum_half, int ntype, typename AT::t_int_1d type, typename AT::t_int_1d d_map, typename AT::t_x_array x, typename AT::t_int_1d d_numneigh_half, typename AT::t_int_1d d_numneigh_full,
+                     typename AT::t_int_1d d_ilist_half, typename AT::t_neighbors_2d d_neighbors_half, typename AT::t_neighbors_2d d_neighbors_full, typename AT::t_int_1d d_offset, int neighflag, int need_dup)
+{
+  namespace KExp = Kokkos::Experimental;
+
+  // publish the arguments as members for use inside the kernel; nlocal is
+  // not a parameter of this function and is therefore left untouched
+  this->ntype = ntype;
+  this->type = type;
+  this->d_map = d_map;
+  this->x = x;
+  this->d_numneigh_half = d_numneigh_half;
+  this->d_numneigh_full = d_numneigh_full;
+  this->d_ilist_half = d_ilist_half;
+  this->d_neighbors_half = d_neighbors_half;
+  this->d_neighbors_full = d_neighbors_full;
+  this->d_offset = d_offset;
+
+  // wrap each accumulator in a sum-type scatter view
+  if (need_dup) {
+    dup_rho0 = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterDuplicated>(d_rho0);
+    dup_arho2b = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterDuplicated>(d_arho2b);
+    dup_arho1 = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterDuplicated>(d_arho1);
+    dup_arho2 = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterDuplicated>(d_arho2);
+    dup_arho3 = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterDuplicated>(d_arho3);
+    dup_arho3b = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterDuplicated>(d_arho3b);
+    dup_t_ave = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterDuplicated>(d_t_ave);
+    dup_tsq_ave = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterDuplicated>(d_tsq_ave);
+  } else {
+    ndup_rho0 = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterNonDuplicated>(d_rho0);
+    ndup_arho2b = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterNonDuplicated>(d_arho2b);
+    ndup_arho1 = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterNonDuplicated>(d_arho1);
+    ndup_arho2 = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterNonDuplicated>(d_arho2);
+    ndup_arho3 = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterNonDuplicated>(d_arho3);
+    ndup_arho3b = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterNonDuplicated>(d_arho3b);
+    ndup_t_ave = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterNonDuplicated>(d_t_ave);
+    ndup_tsq_ave = KExp::create_scatter_view<KExp::ScatterSum, KExp::ScatterNonDuplicated>(d_tsq_ave);
+  }
+
+  // launch the kernel; copymode is set only for the duration of the launch,
+  // which receives *this as the functor
+  copymode = 1;
+  if (neighflag == HALF)
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagMEAMDensInit<HALF> >(0,inum_half),*this);
+  else if (neighflag == HALFTHREAD)
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagMEAMDensInit<HALFTHREAD> >(0,inum_half),*this);
+  copymode = 0;
+
+  if (need_dup) {
+    // fold the duplicated partial sums into the destination views
+    KExp::contribute(d_rho0, dup_rho0);
+    KExp::contribute(d_arho2b, dup_arho2b);
+    KExp::contribute(d_arho1, dup_arho1);
+    KExp::contribute(d_arho2, dup_arho2);
+    KExp::contribute(d_arho3, dup_arho3);
+    KExp::contribute(d_arho3b, dup_arho3b);
+    KExp::contribute(d_t_ave, dup_t_ave);
+    KExp::contribute(d_tsq_ave, dup_tsq_ave);
+
+    // free duplicated memory by assigning default-constructed scatter views
+    dup_rho0 = decltype(dup_rho0)();
+    dup_arho2b = decltype(dup_arho2b)();
+    dup_arho1 = decltype(dup_arho1)();
+    dup_arho2 = decltype(dup_arho2)();
+    dup_arho3 = decltype(dup_arho3)();
+    dup_arho3b = decltype(dup_arho3b)();
+    dup_t_ave = decltype(dup_t_ave)();
+    dup_tsq_ave = decltype(dup_tsq_ave)();
+  }
+}
+
+/* ----------------------------------------------------------------------
+   getscreen: for atom i, fill the per-pair screening data of every
+   half-list neighbor j. Three values are written at index offset+jn:
+     d_scrfcn  - sij, the product of the three-body screening factors
+     d_dscrfcn - derivative term built from dfcut/dCfunc (left at 0 when
+                 sij*fcij is zero or one)
+     d_fcpair  - fcij, the radial cutoff value returned by dfcut(rnorm)
+   Pairs with rij2 > cutforcesq get zeros in all three slots.
+   Returns at once if atom i maps to a negative element index; neighbors
+   j that map to a negative element index are skipped without any write.
+   The ntype parameter is unused.
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int NEIGHFLAG>
+KOKKOS_INLINE_FUNCTION
+void
+MEAMKokkos<DeviceType>::getscreen(int i, int offset, typename AT::t_x_array x, typename AT::t_int_1d d_numneigh_half,
+                typename AT::t_int_1d d_numneigh_full, int /*ntype*/, typename AT::t_int_1d type, typename AT::t_int_1d d_map)
+const {
+  const double drinv = 1.0 / delr_meam;  // scales (cutforce - rij) into the dfcut argument
+  const int elti = d_map[type[i]];
+  if (elti < 0) return;
+
+  const double xitmp = x(i,0);
+  const double yitmp = x(i,1);
+  const double zitmp = x(i,2);
+
+  for (int jn = 0; jn < d_numneigh_half[i]; jn++) {  // half-list neighbors of i
+    const int j = d_neighbors_half(i,jn);
+
+    const int eltj = d_map[type[j]];
+    if (eltj < 0) continue;
+
+    // First compute screening function itself, sij
+    const double xjtmp = x(j,0);
+    const double yjtmp = x(j,1);
+    const double zjtmp = x(j,2);
+    const double delxij = xjtmp - xitmp;
+    const double delyij = yjtmp - yitmp;
+    const double delzij = zjtmp - zitmp;
+
+    const double rij2 = delxij * delxij + delyij * delyij + delzij * delzij;
+
+    if (rij2 > cutforcesq) {  // beyond the force cutoff: store zeros for this pair
+      d_dscrfcn[offset+jn] = 0.0;
+      d_scrfcn[offset+jn] = 0.0;
+      d_fcpair[offset+jn] = 0.0;
+      continue;
+    }
+
+    // Per-pair quantities used by the screening loop below
+    const double rbound = ebound_meam[elti][eltj] * rij2;  // squared-distance bound for candidate atoms k
+    const double rij = sqrt(rij2);
+    const double rnorm = (cutforce - rij) * drinv;  // argument handed to dfcut further down
+    double sij = 1.0;
+
+    // if rjk2 > ebound*rijsq, atom k is definitely outside the ellipse
+    for (int kn = 0; kn < d_numneigh_full[i]; kn++) {  // full-list neighbors of i
+      int k = d_neighbors_full(i,kn);
+      if (k == j) continue;
+      int eltk = d_map[type[k]];
+      if (eltk < 0) continue;
+
+      const double xktmp = x(k,0);
+      const double yktmp = x(k,1);
+      const double zktmp = x(k,2);
+
+      const double delxjk = xktmp - xjtmp;
+      const double delyjk = yktmp - yjtmp;
+      const double delzjk = zktmp - zjtmp;
+      const double rjk2 = delxjk * delxjk + delyjk * delyjk + delzjk * delzjk;
+      if (rjk2 > rbound) continue;
+
+      const double delxik = xktmp - xitmp;
+      const double delyik = yktmp - yitmp;
+      const double delzik = zktmp - zitmp;
+      const double rik2 = delxik * delxik + delyik * delyik + delzik * delzik;
+      if (rik2 > rbound) continue;
+
+      const double xik = rik2 / rij2;
+      const double xjk = rjk2 / rij2;
+      const double a = 1 - (xik - xjk) * (xik - xjk);
+      // if a <= 0, then the ellipse equation doesn't describe this case and
+      // atom k can't possibly screen i-j
+      if (a <= 0.0) continue;
+
+      double cikj = (2.0 * (xik + xjk) + a - 2.0) / a;
+      const double Cmax = Cmax_meam[elti][eltj][eltk];
+      const double Cmin = Cmin_meam[elti][eltj][eltk];
+      double sikj;  // assigned only on the branch that reaches the product below
+      if (cikj >= Cmax) continue;
+      // note that cikj may be slightly negative (within numerical
+      // tolerance) if atoms are colinear, so don't reject that case here
+      // (other negative cikj cases were handled by the test on "a" above)
+      else if (cikj <= Cmin) {
+        sij = 0.0;  // a single fully screening atom zeroes the pair; stop searching
+        break;
+      } else {
+        const double delc = Cmax - Cmin;
+        cikj = (cikj - Cmin) / delc;  // rescale so Cmin maps to 0 and Cmax to 1
+        sikj = fcut(cikj);
+      }
+      sij *= sikj;
+    }
+
+    double dfc;
+    const double fc = dfcut(rnorm, dfc);
+    const double fcij = fc;
+    const double dfcij = dfc * drinv;
+
+    // Now compute the derivative term (skipped when sij*fcij is zero or one)
+    d_dscrfcn[offset+jn] = 0.0;
+    const double sfcij = sij * fcij;
+    if (!iszero_kk(sfcij) && !isone_kk(sfcij)) {
+      for (int kn = 0; kn < d_numneigh_full[i]; kn++) {  // same k selection as the screening loop
+        const int k = d_neighbors_full(i,kn);
+        if (k == j) continue;
+        const int eltk = d_map[type[k]];
+        if (eltk < 0) continue;
+
+        const double delxjk = x(k,0) - xjtmp;
+        const double delyjk = x(k,1) - yjtmp;
+        const double delzjk = x(k,2) - zjtmp;
+        const double rjk2 = delxjk * delxjk + delyjk * delyjk + delzjk * delzjk;
+        if (rjk2 > rbound) continue;
+
+        const double delxik = x(k,0) - xitmp;
+        const double delyik = x(k,1) - yitmp;
+        const double delzik = x(k,2) - zitmp;
+        const double rik2 = delxik * delxik + delyik * delyik + delzik * delzik;
+        if (rik2 > rbound) continue;
+
+        const double xik = rik2 / rij2;
+        const double xjk = rjk2 / rij2;
+        const double a = 1 - (xik - xjk) * (xik - xjk);
+        // if a <= 0, then the ellipse equation doesn't describe this case and
+        // atom k can't possibly screen i-j
+        if (a <= 0.0) continue;
+
+        double cikj = (2.0 * (xik + xjk) + a - 2.0) / a;
+        const double Cmax = Cmax_meam[elti][eltj][eltk];
+        const double Cmin = Cmin_meam[elti][eltj][eltk];
+        if (cikj >= Cmax) {
+          continue;
+          // (The notes below describe the else branch that follows.)
+          // cikj may be slightly negative (within numerical tolerance) if
+          // atoms are colinear, so that case is not rejected here
+          // (other negative cikj cases were handled by the test on "a"
+          // above).
+          // Note that we never have 0<cikj<Cmin here, else sij=0
+          // (rejected above)
+        } else {
+          const double delc = Cmax - Cmin;
+          cikj = (cikj - Cmin) / delc;
+          double dfikj;
+          const double sikj = dfcut(cikj, dfikj);
+          const double coef1 = dfikj / (delc * sikj);
+          const double dCikj = dCfunc(rij2, rik2, rjk2);
+          d_dscrfcn[offset+jn] += coef1 * dCikj;
+        }
+      }
+      const double coef1 = sfcij;
+      const double coef2 = sij * dfcij / rij;
+      d_dscrfcn[offset+jn] = d_dscrfcn[offset+jn] * coef1 - coef2;  // scale the k-sum by sij*fcij, subtract the radial cutoff part
+    }
+
+    d_scrfcn[offset+jn] = sij;
+    d_fcpair[offset+jn] = fcij;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   calc_rho1: accumulate the partial density terms contributed by the
+   half-list pairs (i,j) of atom i. A pair contributes only if its stored
+   screening value d_scrfcn[offset+jn] is nonzero and rij2 < cutforcesq;
+   it then adds to the accumulators of BOTH i and j (rho0, arho2b, arho1,
+   arho2, arho3, arho3b and, depending on ialloy, t_ave / tsq_ave) through
+   scatter-view accessors.
+   Reads d_scrfcn and d_fcpair at the same offset that getscreen writes,
+   so it has to run after getscreen for this atom.
+   The ntype parameter is unused.
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int NEIGHFLAG>
+KOKKOS_INLINE_FUNCTION
+void
+MEAMKokkos<DeviceType>::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d type, typename AT::t_int_1d d_map, typename AT::t_x_array x, typename AT::t_int_1d d_numneigh,
+                int offset) const
+{
+  // The rho0, etc. arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial
+
+  auto v_rho0 = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_rho0),decltype(ndup_rho0)>::get(dup_rho0,ndup_rho0);
+  auto a_rho0 = v_rho0.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_arho2b = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_arho2b),decltype(ndup_arho2b)>::get(dup_arho2b,ndup_arho2b);
+  auto a_arho2b = v_arho2b.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_arho1 = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_arho1),decltype(ndup_arho1)>::get(dup_arho1,ndup_arho1);
+  auto a_arho1 = v_arho1.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_arho2 = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_arho2),decltype(ndup_arho2)>::get(dup_arho2,ndup_arho2);
+  auto a_arho2 = v_arho2.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_arho3 = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_arho3),decltype(ndup_arho3)>::get(dup_arho3,ndup_arho3);
+  auto a_arho3 = v_arho3.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_arho3b = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_arho3b),decltype(ndup_arho3b)>::get(dup_arho3b,ndup_arho3b);
+  auto a_arho3b = v_arho3b.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_t_ave = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_t_ave),decltype(ndup_t_ave)>::get(dup_t_ave,ndup_t_ave);
+  auto a_t_ave = v_t_ave.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+  auto v_tsq_ave = ScatterViewHelper<NeedDup_v<NEIGHFLAG,DeviceType>,decltype(dup_tsq_ave),decltype(ndup_tsq_ave)>::get(dup_tsq_ave,ndup_tsq_ave);
+  auto a_tsq_ave = v_tsq_ave.template access<AtomicDup_v<NEIGHFLAG,DeviceType>>();
+
+  const int elti = d_map[type[i]];  // NOTE(review): no elti < 0 guard here, unlike getscreen - confirm unmapped types never reach this point
+  const double xtmp = x(i,0);
+  const double ytmp = x(i,1);
+  const double ztmp = x(i,2);
+  for (int jn = 0; jn < d_numneigh[i]; jn++) {
+    if (!iszero_kk(d_scrfcn[offset+jn])) {  // fully screened pairs contribute nothing
+      const int j = d_neighbors_half(i,jn);
+      const double sij = d_scrfcn[offset+jn] * d_fcpair[offset+jn];  // screening times radial cutoff
+      double delij[3];  // separation vector, measured from i to j
+      delij[0] = x(j,0) - xtmp;
+      delij[1] = x(j,1) - ytmp;
+      delij[2] = x(j,2) - ztmp;
+      const double rij2 = delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2];
+      if (rij2 < cutforcesq) {
+        const int eltj = d_map[type[j]];
+        const double rij = sqrt(rij2);
+        const double ai = rij / re_meam[elti][elti] - 1.0;
+        const double aj = rij / re_meam[eltj][eltj] - 1.0;
+        const double ro0i = rho0_meam[elti];
+        const double ro0j = rho0_meam[eltj];
+        const double rhoa0j = ro0j * MathSpecialKokkos::fm_exp(-beta0_meam[eltj] * aj) * sij;
+        double rhoa1j = ro0j * MathSpecialKokkos::fm_exp(-beta1_meam[eltj] * aj) * sij;
+        double rhoa2j = ro0j * MathSpecialKokkos::fm_exp(-beta2_meam[eltj] * aj) * sij;
+        double rhoa3j = ro0j * MathSpecialKokkos::fm_exp(-beta3_meam[eltj] * aj) * sij;
+        const double rhoa0i = ro0i * MathSpecialKokkos::fm_exp(-beta0_meam[elti] * ai) * sij;
+        double rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-beta1_meam[elti] * ai) * sij;
+        double rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-beta2_meam[elti] * ai) * sij;
+        double rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-beta3_meam[elti] * ai) * sij;
+        if (ialloy == 1) {  // ialloy == 1: fold the per-element t weights into the densities
+          rhoa1j *= t1_meam[eltj];
+          rhoa2j *= t2_meam[eltj];
+          rhoa3j *= t3_meam[eltj];
+          rhoa1i *= t1_meam[elti];
+          rhoa2i *= t2_meam[elti];
+          rhoa3i *= t3_meam[elti];
+        }
+        a_rho0[i] += rhoa0j;  // i receives j's density and vice versa
+        a_rho0[j] += rhoa0i;
+        // For ialloy = 2, use single-element value (not average)
+        if (ialloy != 2) {
+          a_t_ave(i,0) += t1_meam[eltj] * rhoa0j;
+          a_t_ave(i,1) += t2_meam[eltj] * rhoa0j;
+          a_t_ave(i,2) += t3_meam[eltj] * rhoa0j;
+          a_t_ave(j,0) += t1_meam[elti] * rhoa0i;
+          a_t_ave(j,1) += t2_meam[elti] * rhoa0i;
+          a_t_ave(j,2) += t3_meam[elti] * rhoa0i;
+        }
+        if (ialloy == 1) {  // squared weights are accumulated only for ialloy == 1
+          a_tsq_ave(i,0) += t1_meam[eltj] * t1_meam[eltj] * rhoa0j;
+          a_tsq_ave(i,1) += t2_meam[eltj] * t2_meam[eltj] * rhoa0j;
+          a_tsq_ave(i,2) += t3_meam[eltj] * t3_meam[eltj] * rhoa0j;
+          a_tsq_ave(j,0) += t1_meam[elti] * t1_meam[elti] * rhoa0i;
+          a_tsq_ave(j,1) += t2_meam[elti] * t2_meam[elti] * rhoa0i;
+          a_tsq_ave(j,2) += t3_meam[elti] * t3_meam[elti] * rhoa0i;
+        }
+        a_arho2b[i] += rhoa2j;
+        a_arho2b[j] += rhoa2i;
+
+        const double A1j = rhoa1j / rij;
+        const double A2j = rhoa2j / rij2;
+        const double A3j = rhoa3j / (rij2 * rij);
+        const double A1i = rhoa1i / rij;
+        const double A2i = rhoa2i / rij2;
+        const double A3i = rhoa3i / (rij2 * rij);
+        int nv2 = 0;  // running index over the 6 (m <= n) component pairs
+        int nv3 = 0;  // running index over the 10 (m <= n <= p) component triples
+        for (int m = 0; m < 3; m++) {
+          a_arho1(i,m) += A1j * delij[m];
+          a_arho1(j,m) += -A1i * delij[m];  // terms odd in delij get the opposite sign for j
+          a_arho3b(i,m) += rhoa3j * delij[m] / rij;
+          a_arho3b(j,m) += -rhoa3i * delij[m] / rij;
+          for (int n = m; n < 3; n++) {
+            a_arho2(i,nv2) += A2j * delij[m] * delij[n];
+            a_arho2(j,nv2) += A2i * delij[m] * delij[n];  // even in delij: same sign for i and j
+            nv2++;
+            for (int p = n; p < 3; p++) {
+              a_arho3(i,nv3) += A3j * delij[m] * delij[n] * delij[p];
+              a_arho3(j,nv3) += -A3i * delij[m] * delij[n] * delij[p];
+              nv3++;
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Cutoff function together with its derivative.
+//   xi  : argument of the cutoff function
+//   dfc : output, receives the derivative with respect to xi
+// Returns 1 for xi >= 1 and 0 for xi <= 0 (derivative 0 in both cases);
+// otherwise returns (1 - (1 - xi)^4)^2 with derivative
+// 8 * (1 - (1 - xi)^4) * (1 - xi)^3.
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double MEAMKokkos<DeviceType>::dfcut(const double xi, double& dfc) const
+{
+  // flat on both sides of the open interval (0,1)
+  if (xi >= 1.0 || xi <= 0.0) {
+    dfc = 0.0;
+    return (xi >= 1.0) ? 1.0 : 0.0;
+  }
+
+  const double gap = 1.0 - xi;
+  const double gap3 = gap * gap * gap;
+  const double gap4 = gap * gap3;
+  const double base = 1.0 - gap4;
+
+  dfc = 8 * base * gap3;
+  return base * base;
+}
+
+  //-----------------------------------------------------------------------------
+  // Derivative of Cikj w.r.t. rij
+  // Inputs: rij2, rik2, rjk2 (the squared i-j, i-k and j-k distances)
+  // Returns: -4 * (-2*rij2*d^2 + rij2^2*s + d^2*s) / (rij2^2 - d^2)^2
+  //          with d = rik2 - rjk2 and s = rik2 + rjk2
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double MEAMKokkos<DeviceType>::dCfunc(const double rij2, const double rik2, const double rjk2) const
+{
+  const double diff = rik2 - rjk2;
+  const double sum = rik2 + rjk2;
+  const double rij4 = rij2 * rij2;
+  const double diffsq = diff*diff;
+  const double root = rij4 - diffsq;
+  const double denom = root * root;
+  return -4 * (-2 * rij2 * diffsq + rij4 * sum + diffsq * sum) / denom;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Evaluates two derivative expressions of Cikj from the squared distances
+// rij2, rik2 and rjk2 and returns them through dCikj1 and dCikj2.
+// Both share the denominator (rij2^2 - (rik2 - rjk2)^2)^2 and the prefactor
+// 4 * rij2; the two numerators mirror each other with rik and rjk exchanged.
+// NOTE(review): presumably these are the derivatives w.r.t. rik and rjk
+// respectively - confirm against the callers.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void MEAMKokkos<DeviceType>::dCfunc2(const double rij2, const double rik2, const double rjk2, double& dCikj1, double& dCikj2) const
+{
+  const double rij4 = rij2 * rij2;
+  const double rik4 = rik2 * rik2;
+  const double rjk4 = rjk2 * rjk2;
+  const double diff = rik2 - rjk2;
+
+  const double root = rij4 - diff * diff;
+  const double denom = root * root;
+
+  const double pref = 4 * rij2;
+  const double cross = 2 * rik2 * rjk2;
+  const double shift = 2 * rij2 * diff;
+
+  dCikj1 = pref * (rij4 + rik4 + cross - 3 * rjk4 - shift) / denom;
+  dCikj2 = pref * (rij4 - 3 * rik4 + cross + rjk4 + shift) / denom;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Cutoff function: 1 for xi >= 1, 0 for xi <= 0, and
+// (1 - (1 - xi)^4)^2 in between.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double MEAMKokkos<DeviceType>::fcut(const double xi) const
+{
+  if (xi >= 1.0) return 1.0;
+  if (xi <= 0.0) return 0.0;
+
+  // fourth power obtained by squaring twice
+  const double gap = 1.0 - xi;
+  const double gap2 = gap * gap;
+  const double gap4 = gap2 * gap2;
+  const double base = 1.0 - gap4;
+  return base * base;
+}
+
--- a/src/KOKKOS/meam_force_kokkos.h
+++ b/src/KOKKOS/meam_force_kokkos.h
@ -0,0 +1,613 @@
+#include "math_special_kokkos.h"
+#include "meam_kokkos.h"
+#include <algorithm>
+
+using namespace LAMMPS_NS;
+using namespace MathSpecialKokkos;
+
+// Launch the MEAM force kernel.
+//
+// Copies the energy/virial flags and the views passed by the caller into the
+// functor members, creates the scatter views used to accumulate f, d_eatom and
+// d_vatom, runs operator()(TagMEAMForce<...>) over the first inum_half entries
+// of the half neighbor list, and adds the reduced result to ev_all.
+//
+//   eflag_global/eflag_atom : request global / per-atom energy tallies
+//   vflag_global/vflag_atom : request global / per-atom virial tallies
+//   neighflag               : HALF or HALFTHREAD; any other value launches no kernel
+//   need_dup                : non-zero selects duplicated scatter views, which are
+//                             contributed back into the target views and released
+//                             after the kernel
+//   ev_all                  : incremented by the kernel's reduction result
+template <class DeviceType>
+void MEAMKokkos<DeviceType>::meam_force(
+    int inum_half, int eflag_global, int eflag_atom, int vflag_global, int vflag_atom,
+    typename ArrayTypes<DeviceType>::t_efloat_1d eatom, int ntype, typename AT::t_int_1d type,
+    typename AT::t_int_1d d_map, typename AT::t_x_array x, typename AT::t_int_1d numneigh,
+    typename AT::t_int_1d numneigh_full, typename AT::t_f_array f,
+    typename ArrayTypes<DeviceType>::t_virial_array vatom, typename AT::t_int_1d d_ilist_half,
+    typename AT::t_int_1d d_offset, typename AT::t_neighbors_2d d_neighbors_half,
+    typename AT::t_neighbors_2d d_neighbors_full, int neighflag, int need_dup, EV_FLOAT &ev_all)
+{
+  EV_FLOAT ev;
+
+  // The parameters shadow the members of the same name, hence the this->.
+  // The "either" flags have no matching parameter and are derived here.
+  this->eflag_global = eflag_global;
+  this->eflag_atom = eflag_atom;
+  this->vflag_global = vflag_global;
+  this->vflag_atom = vflag_atom;
+  this->eflag_either = eflag_atom || eflag_global;
+  this->vflag_either = vflag_atom || vflag_global;
+  this->d_eatom = eatom;
+  this->ntype = ntype;
+  this->type = type;
+  this->d_map = d_map;
+  this->x = x;
+  this->d_numneigh_half = numneigh;
+  this->d_numneigh_full = numneigh_full;
+  this->d_neighbors_half = d_neighbors_half;
+  this->d_neighbors_full = d_neighbors_full;
+  this->f = f;
+  this->d_vatom = vatom;
+  this->d_ilist_half = d_ilist_half;
+  this->d_offset = d_offset;
+
+  // Per-atom energy/virial scatter views are only created when requested.
+  if (need_dup) {
+    dup_f = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum,
+                                                      Kokkos::Experimental::ScatterDuplicated>(f);
+    if (eflag_atom)
+      dup_eatom = Kokkos::Experimental::create_scatter_view<
+          Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_eatom);
+    if (vflag_atom)
+      dup_vatom = Kokkos::Experimental::create_scatter_view<
+          Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom);
+  } else {
+    ndup_f =
+        Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum,
+                                                  Kokkos::Experimental::ScatterNonDuplicated>(f);
+    if (eflag_atom)
+      ndup_eatom = Kokkos::Experimental::create_scatter_view<
+          Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_eatom);
+    if (vflag_atom)
+      ndup_vatom = Kokkos::Experimental::create_scatter_view<
+          Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom);
+  }
+
+  // copymode is set while *this is handed to Kokkos by value; the destructor
+  // returns early when it is set.
+  copymode = 1;
+  if (neighflag == HALF)
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMForce<HALF>>(0, inum_half),
+                            *this, ev);
+  else if (neighflag == HALFTHREAD)
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagMEAMForce<HALFTHREAD>>(0, inum_half),
+                            *this, ev);
+  ev_all += ev;
+  copymode = 0;
+
+  if (need_dup) {
+    // fold the duplicated copies back into the target views
+    Kokkos::Experimental::contribute(f, dup_f);
+    if (eflag_atom) Kokkos::Experimental::contribute(d_eatom, dup_eatom);
+    if (vflag_atom) Kokkos::Experimental::contribute(d_vatom, dup_vatom);
+
+    // free duplicated memory
+    dup_f = decltype(dup_f)();
+    if (eflag_atom) dup_eatom = decltype(dup_eatom)();
+    if (vflag_atom) dup_vatom = decltype(dup_vatom)();
+  }
+}
+
+// Force kernel for one entry of the half neighbor list (tag TagMEAMForce<NEIGHFLAG>).
+//
+//   ii : index into d_ilist_half selecting the central atom i
+//   ev : reduction accumulator; receives the pair energy (evdwl) and the global
+//        virial (v[0..5]) when the corresponding flags are set
+//
+// For every half-list neighbor j of i whose stored screening value is non-zero
+// and which lies inside cutforcesq, the kernel
+//   1. interpolates the tabulated pair function phi and its derivative phip,
+//   2. evaluates the derivatives of the partial densities rho0..rho3 and of the
+//      weighting factors t1..t3 with respect to rij and sij,
+//   3. accumulates the resulting forces (and optional energy/virial) on i and j,
+//   4. loops over the full neighbor list of i to add the forces on third atoms
+//      k that come from the derivative of the screening factor sij.
+template <class DeviceType>
+template <int NEIGHFLAG>
+KOKKOS_INLINE_FUNCTION void MEAMKokkos<DeviceType>::operator()(TagMEAMForce<NEIGHFLAG>,
+                                                               const int &ii, EV_FLOAT &ev) const
+{
+  int i, j, jn, k, kn, kk, m, n, p, q;
+  int nv2, nv3, elti, eltj, eltk, ind;
+  X_FLOAT xitmp, yitmp, zitmp, delij[3];
+  double rij2, rij, rij3;
+  double v[6], fi[3], fj[3];
+  double third, sixth;
+  double pp, dUdrij, dUdsij, dUdrijm[3], force, forcem;
+  double recip, phi, phip;
+  double sij;
+  double a1, a1i, a1j, a2, a2i, a2j;
+  double a3i, a3j;
+  double shpi[3], shpj[3];
+  double ai, aj, ro0i, ro0j, invrei, invrej;
+  double rhoa0j, drhoa0j, rhoa0i, drhoa0i;
+  double rhoa1j, drhoa1j, rhoa1i, drhoa1i;
+  double rhoa2j, drhoa2j, rhoa2i, drhoa2i;
+  double a3, a3a, rhoa3j, drhoa3j, rhoa3i, drhoa3i;
+  double drho0dr1, drho0dr2, drho0ds1, drho0ds2;
+  double drho1dr1, drho1dr2, drho1ds1, drho1ds2;
+  double drho1drm1[3], drho1drm2[3];
+  double drho2dr1, drho2dr2, drho2ds1, drho2ds2;
+  double drho2drm1[3], drho2drm2[3];
+  double drho3dr1, drho3dr2, drho3ds1, drho3ds2;
+  double drho3drm1[3], drho3drm2[3];
+  double dt1dr1, dt1dr2, dt1ds1, dt1ds2;
+  double dt2dr1, dt2dr2, dt2ds1, dt2ds2;
+  double dt3dr1, dt3dr2, dt3ds1, dt3ds2;
+  double drhodr1, drhodr2, drhods1, drhods2, drhodrm1[3], drhodrm2[3];
+  double arg;
+  double arg1i1, arg1j1, arg1i2, arg1j2, arg1i3, arg1j3, arg3i3, arg3j3;
+  double dsij1, dsij2, force1, force2;
+  double t1i, t2i, t3i, t1j, t2j, t3j;
+  int fnoffset;
+
+  // The f, etc. arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial
+
+  auto v_f =
+      ScatterViewHelper<NeedDup_v<NEIGHFLAG, DeviceType>, decltype(dup_f), decltype(ndup_f)>::get(
+          dup_f, ndup_f);
+  auto a_f = v_f.template access<AtomicDup_v<NEIGHFLAG, DeviceType>>();
+  auto v_eatom = ScatterViewHelper<NeedDup_v<NEIGHFLAG, DeviceType>, decltype(dup_eatom),
+                                   decltype(ndup_eatom)>::get(dup_eatom, ndup_eatom);
+  auto a_eatom = v_eatom.template access<AtomicDup_v<NEIGHFLAG, DeviceType>>();
+  auto v_vatom = ScatterViewHelper<NeedDup_v<NEIGHFLAG, DeviceType>, decltype(dup_vatom),
+                                   decltype(ndup_vatom)>::get(dup_vatom, ndup_vatom);
+  auto a_vatom = v_vatom.template access<AtomicDup_v<NEIGHFLAG, DeviceType>>();
+
+  i = d_ilist_half[ii];
+  // start of atom i's entries in the flat per-pair arrays (d_scrfcn, d_dscrfcn, d_fcpair)
+  fnoffset = d_offset[i];
+  third = 1.0 / 3.0;
+  sixth = 1.0 / 6.0;
+
+  // atoms whose type maps to a negative element index are skipped
+  elti = d_map[type[i]];
+  if (elti < 0) return;
+
+  xitmp = x(i, 0);
+  yitmp = x(i, 1);
+  zitmp = x(i, 2);
+
+  // Treat each pair
+  for (jn = 0; jn < d_numneigh_half[i]; jn++) {
+    j = d_neighbors_half(i, jn);
+    eltj = d_map[type[j]];
+
+    if (!iszero_kk(d_scrfcn[fnoffset + jn]) && eltj >= 0) {
+
+      // sij combines the stored screening value with the stored pair cutoff value
+      sij = d_scrfcn[fnoffset + jn] * d_fcpair[fnoffset + jn];
+      delij[0] = x(j, 0) - xitmp;
+      delij[1] = x(j, 1) - yitmp;
+      delij[2] = x(j, 2) - zitmp;
+      rij2 = delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2];
+      if (rij2 < cutforcesq) {
+        rij = sqrt(rij2);
+        recip = 1.0 / rij;
+
+        // Compute phi and phip
+        // Table lookup: kk is the interval index (clamped to nrar - 2), pp the
+        // fractional position inside it (clamped to 1); phi is a cubic and
+        // phip a quadratic in pp.
+        ind = eltind[elti][eltj];
+        pp = rij * rdrar;
+        kk = (int) pp;
+        kk = (kk <= (nrar - 2)) ? kk : nrar - 2;
+        pp = pp - kk;
+        pp = (pp <= 1.0) ? pp : 1.0;
+        phi = ((d_phirar3(ind, kk) * pp + d_phirar2(ind, kk)) * pp + d_phirar1(ind, kk)) * pp +
+            d_phirar(ind, kk);
+        phip = (d_phirar6(ind, kk) * pp + d_phirar5(ind, kk)) * pp + d_phirar4(ind, kk);
+
+        if (eflag_either) {
+          // Scale factor of the (i,j) type pair. The same scaled pair energy
+          // feeds the global and the per-atom tallies so that the per-atom
+          // energies sum to the global value.
+          // NOTE(review): the forces below are not multiplied by scaleij --
+          // confirm whether that is intended when the scale differs from 1.
+          const double scaleij = d_scale(type[i], type[j]);
+          const double phi_sc = phi * scaleij;
+          if (eflag_global) ev.evdwl += phi_sc * sij;
+          if (eflag_atom) {
+            a_eatom[i] += 0.5 * phi_sc * sij;
+            a_eatom[j] += 0.5 * phi_sc * sij;
+          }
+        }
+
+        // Compute pair densities and derivatives
+        invrei = 1.0 / re_meam[elti][elti];
+        ai = rij * invrei - 1.0;
+        ro0i = rho0_meam[elti];
+        rhoa0i = ro0i * MathSpecialKokkos::fm_exp(-beta0_meam[elti] * ai);
+        drhoa0i = -beta0_meam[elti] * invrei * rhoa0i;
+        rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-beta1_meam[elti] * ai);
+        drhoa1i = -beta1_meam[elti] * invrei * rhoa1i;
+        rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-beta2_meam[elti] * ai);
+        drhoa2i = -beta2_meam[elti] * invrei * rhoa2i;
+        rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-beta3_meam[elti] * ai);
+        drhoa3i = -beta3_meam[elti] * invrei * rhoa3i;
+
+        // same element: reuse the values just computed for i
+        if (elti != eltj) {
+          invrej = 1.0 / re_meam[eltj][eltj];
+          aj = rij * invrej - 1.0;
+          ro0j = rho0_meam[eltj];
+          rhoa0j = ro0j * MathSpecialKokkos::fm_exp(-beta0_meam[eltj] * aj);
+          drhoa0j = -beta0_meam[eltj] * invrej * rhoa0j;
+          rhoa1j = ro0j * MathSpecialKokkos::fm_exp(-beta1_meam[eltj] * aj);
+          drhoa1j = -beta1_meam[eltj] * invrej * rhoa1j;
+          rhoa2j = ro0j * MathSpecialKokkos::fm_exp(-beta2_meam[eltj] * aj);
+          drhoa2j = -beta2_meam[eltj] * invrej * rhoa2j;
+          rhoa3j = ro0j * MathSpecialKokkos::fm_exp(-beta3_meam[eltj] * aj);
+          drhoa3j = -beta3_meam[eltj] * invrej * rhoa3j;
+        } else {
+          rhoa0j = rhoa0i;
+          drhoa0j = drhoa0i;
+          rhoa1j = rhoa1i;
+          drhoa1j = drhoa1i;
+          rhoa2j = rhoa2i;
+          drhoa2j = drhoa2i;
+          rhoa3j = rhoa3i;
+          drhoa3j = drhoa3i;
+        }
+
+        const double t1mi = t1_meam[elti];
+        const double t2mi = t2_meam[elti];
+        const double t3mi = t3_meam[elti];
+        const double t1mj = t1_meam[eltj];
+        const double t2mj = t2_meam[eltj];
+        const double t3mj = t3_meam[eltj];
+
+        // with ialloy == 1 the element t factors are folded into the densities
+        if (ialloy == 1) {
+          rhoa1j *= t1mj;
+          rhoa2j *= t2mj;
+          rhoa3j *= t3mj;
+          rhoa1i *= t1mi;
+          rhoa2i *= t2mi;
+          rhoa3i *= t3mi;
+          drhoa1j *= t1mj;
+          drhoa2j *= t2mj;
+          drhoa3j *= t3mj;
+          drhoa1i *= t1mi;
+          drhoa2i *= t2mi;
+          drhoa3i *= t3mi;
+        }
+
+        // Contractions of the stored arho arrays with delij; nv2 / nv3 walk
+        // the packed symmetric index sets weighted by v2D / v3D.
+        nv2 = 0;
+        nv3 = 0;
+        arg1i1 = 0.0;
+        arg1j1 = 0.0;
+        arg1i2 = 0.0;
+        arg1j2 = 0.0;
+        arg1i3 = 0.0;
+        arg1j3 = 0.0;
+        arg3i3 = 0.0;
+        arg3j3 = 0.0;
+        for (n = 0; n < 3; n++) {
+          for (p = n; p < 3; p++) {
+            for (q = p; q < 3; q++) {
+              arg = delij[n] * delij[p] * delij[q] * v3D[nv3];
+              arg1i3 = arg1i3 + d_arho3(i, nv3) * arg;
+              arg1j3 = arg1j3 - d_arho3(j, nv3) * arg;
+              nv3 = nv3 + 1;
+            }
+            arg = delij[n] * delij[p] * v2D[nv2];
+            arg1i2 = arg1i2 + d_arho2(i, nv2) * arg;
+            arg1j2 = arg1j2 + d_arho2(j, nv2) * arg;
+            nv2 = nv2 + 1;
+          }
+          arg1i1 = arg1i1 + d_arho1(i, n) * delij[n];
+          arg1j1 = arg1j1 - d_arho1(j, n) * delij[n];
+          arg3i3 = arg3i3 + d_arho3b(i, n) * delij[n];
+          arg3j3 = arg3j3 - d_arho3b(j, n) * delij[n];
+        }
+
+        // rho0 terms
+        drho0dr1 = drhoa0j * sij;
+        drho0dr2 = drhoa0i * sij;
+
+        // rho1 terms
+        a1 = 2 * sij / rij;
+        drho1dr1 = a1 * (drhoa1j - rhoa1j / rij) * arg1i1;
+        drho1dr2 = a1 * (drhoa1i - rhoa1i / rij) * arg1j1;
+        a1 = 2.0 * sij / rij;
+        for (m = 0; m < 3; m++) {
+          drho1drm1[m] = a1 * rhoa1j * d_arho1(i, m);
+          drho1drm2[m] = -a1 * rhoa1i * d_arho1(j, m);
+        }
+
+        // rho2 terms
+        a2 = 2 * sij / rij2;
+        drho2dr1 =
+            a2 * (drhoa2j - 2 * rhoa2j / rij) * arg1i2 - 2.0 / 3.0 * d_arho2b[i] * drhoa2j * sij;
+        drho2dr2 =
+            a2 * (drhoa2i - 2 * rhoa2i / rij) * arg1j2 - 2.0 / 3.0 * d_arho2b[j] * drhoa2i * sij;
+        a2 = 4 * sij / rij2;
+        for (m = 0; m < 3; m++) {
+          drho2drm1[m] = 0.0;
+          drho2drm2[m] = 0.0;
+          for (n = 0; n < 3; n++) {
+            drho2drm1[m] = drho2drm1[m] + d_arho2(i, vind2D[m][n]) * delij[n];
+            drho2drm2[m] = drho2drm2[m] - d_arho2(j, vind2D[m][n]) * delij[n];
+          }
+          drho2drm1[m] = a2 * rhoa2j * drho2drm1[m];
+          drho2drm2[m] = -a2 * rhoa2i * drho2drm2[m];
+        }
+
+        // rho3 terms
+        rij3 = rij * rij2;
+        a3 = 2 * sij / rij3;
+        a3a = 6.0 / 5.0 * sij / rij;
+        drho3dr1 =
+            a3 * (drhoa3j - 3 * rhoa3j / rij) * arg1i3 - a3a * (drhoa3j - rhoa3j / rij) * arg3i3;
+        drho3dr2 =
+            a3 * (drhoa3i - 3 * rhoa3i / rij) * arg1j3 - a3a * (drhoa3i - rhoa3i / rij) * arg3j3;
+        a3 = 6 * sij / rij3;
+        a3a = 6 * sij / (5 * rij);
+        for (m = 0; m < 3; m++) {
+          drho3drm1[m] = 0.0;
+          drho3drm2[m] = 0.0;
+          nv2 = 0;
+          for (n = 0; n < 3; n++) {
+            for (p = n; p < 3; p++) {
+              arg = delij[n] * delij[p] * v2D[nv2];
+              drho3drm1[m] = drho3drm1[m] + d_arho3(i, vind3D[m][n][p]) * arg;
+              drho3drm2[m] = drho3drm2[m] + d_arho3(j, vind3D[m][n][p]) * arg;
+              nv2 = nv2 + 1;
+            }
+          }
+          drho3drm1[m] = (a3 * drho3drm1[m] - a3a * d_arho3b(i, m)) * rhoa3j;
+          drho3drm2[m] = (-a3 * drho3drm2[m] + a3a * d_arho3b(j, m)) * rhoa3i;
+        }
+
+        // Compute derivatives of weighting functions t wrt rij
+        t1i = d_t_ave(i, 0);
+        t2i = d_t_ave(i, 1);
+        t3i = d_t_ave(i, 2);
+        t1j = d_t_ave(j, 0);
+        t2j = d_t_ave(j, 1);
+        t3j = d_t_ave(j, 2);
+
+        if (ialloy == 1) {
+
+          a1i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 0));
+          a1j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 0));
+          a2i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 1));
+          a2j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 1));
+          a3i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i, 2));
+          a3j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j, 2));
+
+          dt1dr1 = a1i * (t1mj - t1i * MathSpecialKokkos::square(t1mj));
+          dt1dr2 = a1j * (t1mi - t1j * MathSpecialKokkos::square(t1mi));
+          dt2dr1 = a2i * (t2mj - t2i * MathSpecialKokkos::square(t2mj));
+          dt2dr2 = a2j * (t2mi - t2j * MathSpecialKokkos::square(t2mi));
+          dt3dr1 = a3i * (t3mj - t3i * MathSpecialKokkos::square(t3mj));
+          dt3dr2 = a3j * (t3mi - t3j * MathSpecialKokkos::square(t3mi));
+
+        } else if (ialloy == 2) {
+
+          dt1dr1 = 0.0;
+          dt1dr2 = 0.0;
+          dt2dr1 = 0.0;
+          dt2dr2 = 0.0;
+          dt3dr1 = 0.0;
+          dt3dr2 = 0.0;
+
+        } else {
+
+          // ai / aj are reused here as prefactors; they stay 0 when rho0 is zero
+          ai = 0.0;
+          if (!iszero_kk(d_rho0[i])) ai = drhoa0j * sij / d_rho0[i];
+          aj = 0.0;
+          if (!iszero_kk(d_rho0[j])) aj = drhoa0i * sij / d_rho0[j];
+
+          dt1dr1 = ai * (t1mj - t1i);
+          dt1dr2 = aj * (t1mi - t1j);
+          dt2dr1 = ai * (t2mj - t2i);
+          dt2dr2 = aj * (t2mi - t2j);
+          dt3dr1 = ai * (t3mj - t3i);
+          dt3dr2 = aj * (t3mi - t3j);
+        }
+
+        // Compute derivatives of total density wrt rij, sij and rij(3)
+        // NOTE(review): shpj takes the lattice of eltj but the stheta/ctheta
+        // entries of elti -- confirm whether eltj was intended there.
+        get_shpfcn(lattce_meam[elti][elti], stheta_meam[elti][elti], ctheta_meam[elti][elti], shpi);
+        get_shpfcn(lattce_meam[eltj][eltj], stheta_meam[elti][elti], ctheta_meam[elti][elti], shpj);
+
+        drhodr1 = d_dgamma1[i] * drho0dr1 +
+            d_dgamma2[i] *
+                (dt1dr1 * d_rho1[i] + t1i * drho1dr1 + dt2dr1 * d_rho2[i] + t2i * drho2dr1 +
+                 dt3dr1 * d_rho3[i] + t3i * drho3dr1) -
+            d_dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1);
+        drhodr2 = d_dgamma1[j] * drho0dr2 +
+            d_dgamma2[j] *
+                (dt1dr2 * d_rho1[j] + t1j * drho1dr2 + dt2dr2 * d_rho2[j] + t2j * drho2dr2 +
+                 dt3dr2 * d_rho3[j] + t3j * drho3dr2) -
+            d_dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2);
+        for (m = 0; m < 3; m++) {
+          drhodrm1[m] = 0.0;
+          drhodrm2[m] = 0.0;
+          drhodrm1[m] =
+              d_dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]);
+          drhodrm2[m] =
+              d_dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]);
+        }
+
+        // Compute derivatives wrt sij, but only if necessary
+        if (!iszero_kk(d_dscrfcn[fnoffset + jn])) {
+          drho0ds1 = rhoa0j;
+          drho0ds2 = rhoa0i;
+          a1 = 2.0 / rij;
+          drho1ds1 = a1 * rhoa1j * arg1i1;
+          drho1ds2 = a1 * rhoa1i * arg1j1;
+          a2 = 2.0 / rij2;
+          drho2ds1 = a2 * rhoa2j * arg1i2 - 2.0 / 3.0 * d_arho2b[i] * rhoa2j;
+          drho2ds2 = a2 * rhoa2i * arg1j2 - 2.0 / 3.0 * d_arho2b[j] * rhoa2i;
+          a3 = 2.0 / rij3;
+          a3a = 6.0 / (5.0 * rij);
+          drho3ds1 = a3 * rhoa3j * arg1i3 - a3a * rhoa3j * arg3i3;
+          drho3ds2 = a3 * rhoa3i * arg1j3 - a3a * rhoa3i * arg3j3;
+
+          if (ialloy == 1) {
+            a1i = fdiv_zero_kk(rhoa0j, d_tsq_ave(i, 0));
+            a1j = fdiv_zero_kk(rhoa0i, d_tsq_ave(j, 0));
+            a2i = fdiv_zero_kk(rhoa0j, d_tsq_ave(i, 1));
+            a2j = fdiv_zero_kk(rhoa0i, d_tsq_ave(j, 1));
+            a3i = fdiv_zero_kk(rhoa0j, d_tsq_ave(i, 2));
+            a3j = fdiv_zero_kk(rhoa0i, d_tsq_ave(j, 2));
+
+            dt1ds1 = a1i * (t1mj - t1i * MathSpecialKokkos::square(t1mj));
+            dt1ds2 = a1j * (t1mi - t1j * MathSpecialKokkos::square(t1mi));
+            dt2ds1 = a2i * (t2mj - t2i * MathSpecialKokkos::square(t2mj));
+            dt2ds2 = a2j * (t2mi - t2j * MathSpecialKokkos::square(t2mi));
+            dt3ds1 = a3i * (t3mj - t3i * MathSpecialKokkos::square(t3mj));
+            dt3ds2 = a3j * (t3mi - t3j * MathSpecialKokkos::square(t3mi));
+
+          } else if (ialloy == 2) {
+
+            dt1ds1 = 0.0;
+            dt1ds2 = 0.0;
+            dt2ds1 = 0.0;
+            dt2ds2 = 0.0;
+            dt3ds1 = 0.0;
+            dt3ds2 = 0.0;
+
+          } else {
+
+            ai = 0.0;
+            if (!iszero_kk(d_rho0[i])) ai = rhoa0j / d_rho0[i];
+            aj = 0.0;
+            if (!iszero_kk(d_rho0[j])) aj = rhoa0i / d_rho0[j];
+
+            dt1ds1 = ai * (t1mj - t1i);
+            dt1ds2 = aj * (t1mi - t1j);
+            dt2ds1 = ai * (t2mj - t2i);
+            dt2ds2 = aj * (t2mi - t2j);
+            dt3ds1 = ai * (t3mj - t3i);
+            dt3ds2 = aj * (t3mi - t3j);
+          }
+
+          drhods1 = d_dgamma1[i] * drho0ds1 +
+              d_dgamma2[i] *
+                  (dt1ds1 * d_rho1[i] + t1i * drho1ds1 + dt2ds1 * d_rho2[i] + t2i * drho2ds1 +
+                   dt3ds1 * d_rho3[i] + t3i * drho3ds1) -
+              d_dgamma3[i] * (shpi[0] * dt1ds1 + shpi[1] * dt2ds1 + shpi[2] * dt3ds1);
+          drhods2 = d_dgamma1[j] * drho0ds2 +
+              d_dgamma2[j] *
+                  (dt1ds2 * d_rho1[j] + t1j * drho1ds2 + dt2ds2 * d_rho2[j] + t2j * drho2ds2 +
+                   dt3ds2 * d_rho3[j] + t3j * drho3ds2) -
+              d_dgamma3[j] * (shpj[0] * dt1ds2 + shpj[1] * dt2ds2 + shpj[2] * dt3ds2);
+        }
+
+        // Compute derivatives of energy wrt rij, sij and rij[3]
+        // drhods1 / drhods2 are only defined (and only read) when dscrfcn is non-zero
+        dUdrij = phip * sij + d_frhop[i] * drhodr1 + d_frhop[j] * drhodr2;
+        dUdsij = 0.0;
+        if (!iszero_kk(d_dscrfcn[fnoffset + jn])) {
+          dUdsij = phi + d_frhop[i] * drhods1 + d_frhop[j] * drhods2;
+        }
+        for (m = 0; m < 3; m++) {
+          dUdrijm[m] = d_frhop[i] * drhodrm1[m] + d_frhop[j] * drhodrm2[m];
+        }
+
+        // Add the part of the force due to dUdrij and dUdsij
+        force = dUdrij * recip + dUdsij * d_dscrfcn[fnoffset + jn];
+        for (m = 0; m < 3; m++) {
+          forcem = delij[m] * force + dUdrijm[m];
+          a_f(i, m) += forcem;
+          a_f(j, m) -= forcem;
+        }
+
+        // Tabulate per-atom virial as symmetrized stress tensor
+
+        if (vflag_either) {
+          fi[0] = delij[0] * force + dUdrijm[0];
+          fi[1] = delij[1] * force + dUdrijm[1];
+          fi[2] = delij[2] * force + dUdrijm[2];
+          v[0] = -0.5 * (delij[0] * fi[0]);
+          v[1] = -0.5 * (delij[1] * fi[1]);
+          v[2] = -0.5 * (delij[2] * fi[2]);
+          v[3] = -0.25 * (delij[0] * fi[1] + delij[1] * fi[0]);
+          v[4] = -0.25 * (delij[0] * fi[2] + delij[2] * fi[0]);
+          v[5] = -0.25 * (delij[1] * fi[2] + delij[2] * fi[1]);
+
+          // v[] is the share given to each of the two atoms, hence the factor 2
+          if (vflag_global)
+            for (m = 0; m < 6; m++) ev.v[m] += 2.0 * v[m];
+
+          if (vflag_atom) {
+            for (m = 0; m < 6; m++) {
+              a_vatom(i, m) += v[m];
+              a_vatom(j, m) += v[m];
+            }
+          }
+        }
+
+        // Now compute forces on other atoms k due to change in sij
+
+        if (iszero_kk(sij) || isone_kk(sij)) continue;    //: cont jn loop
+
+        double dxik(0), dyik(0), dzik(0);
+        double dxjk(0), dyjk(0), dzjk(0);
+
+        for (kn = 0; kn < d_numneigh_full[i]; kn++) {
+          k = d_neighbors_full(i, kn);
+          eltk = d_map[type[k]];
+          if (k != j && eltk >= 0) {
+            double xik, xjk, cikj, sikj, dfc, a;
+            double dCikj1, dCikj2;
+            double delc, rik2, rjk2;
+
+            sij = d_scrfcn[jn + fnoffset] * d_fcpair[jn + fnoffset];
+            const double Cmax = Cmax_meam[elti][eltj][eltk];
+            const double Cmin = Cmin_meam[elti][eltj][eltk];
+
+            // dsij1 / dsij2 stay zero unless k passes the distance tests and
+            // its cikj lies in [Cmin, Cmax]
+            dsij1 = 0.0;
+            dsij2 = 0.0;
+            if (!iszero_kk(sij) && !isone_kk(sij)) {
+              const double rbound = rij2 * ebound_meam[elti][eltj];
+              delc = Cmax - Cmin;
+              dxjk = x(k, 0) - x(j, 0);
+              dyjk = x(k, 1) - x(j, 1);
+              dzjk = x(k, 2) - x(j, 2);
+              rjk2 = dxjk * dxjk + dyjk * dyjk + dzjk * dzjk;
+              if (rjk2 <= rbound) {
+                dxik = x(k, 0) - x(i, 0);
+                dyik = x(k, 1) - x(i, 1);
+                dzik = x(k, 2) - x(i, 2);
+                rik2 = dxik * dxik + dyik * dyik + dzik * dzik;
+                if (rik2 <= rbound) {
+                  xik = rik2 / rij2;
+                  xjk = rjk2 / rij2;
+                  a = 1 - (xik - xjk) * (xik - xjk);
+                  if (!iszero_kk(a)) {
+                    cikj = (2.0 * (xik + xjk) + a - 2.0) / a;
+                    if (cikj >= Cmin && cikj <= Cmax) {
+                      cikj = (cikj - Cmin) / delc;
+                      sikj = dfcut(cikj, dfc);
+                      dCfunc2(rij2, rik2, rjk2, dCikj1, dCikj2);
+                      a = sij / delc * dfc / sikj;
+                      dsij1 = a * dCikj1;
+                      dsij2 = a * dCikj2;
+                    }
+                  }
+                }
+              }
+            }
+
+            if (!iszero_kk(dsij1) || !iszero_kk(dsij2)) {
+              force1 = dUdsij * dsij1;
+              force2 = dUdsij * dsij2;
+
+              // atom k receives the negative sum of what is added to i and j
+              a_f(i, 0) += force1 * dxik;
+              a_f(i, 1) += force1 * dyik;
+              a_f(i, 2) += force1 * dzik;
+              a_f(j, 0) += force2 * dxjk;
+              a_f(j, 1) += force2 * dyjk;
+              a_f(j, 2) += force2 * dzjk;
+              a_f(k, 0) -= force1 * dxik + force2 * dxjk;
+              a_f(k, 1) -= force1 * dyik + force2 * dyjk;
+              a_f(k, 2) -= force1 * dzik + force2 * dzjk;
+
+              // Tabulate per-atom virial as symmetrized stress tensor
+
+              if (vflag_either) {
+                fi[0] = force1 * dxik;
+                fi[1] = force1 * dyik;
+                fi[2] = force1 * dzik;
+                fj[0] = force2 * dxjk;
+                fj[1] = force2 * dyjk;
+                fj[2] = force2 * dzjk;
+                v[0] = -third * (dxik * fi[0] + dxjk * fj[0]);
+                v[1] = -third * (dyik * fi[1] + dyjk * fj[1]);
+                v[2] = -third * (dzik * fi[2] + dzjk * fj[2]);
+                v[3] = -sixth * (dxik * fi[1] + dxjk * fj[1] + dyik * fi[0] + dyjk * fj[0]);
+                v[4] = -sixth * (dxik * fi[2] + dxjk * fj[2] + dzik * fi[0] + dzjk * fj[0]);
+                v[5] = -sixth * (dyik * fi[2] + dyjk * fj[2] + dzik * fi[1] + dzjk * fj[1]);
+
+                // v[] is the share given to each of the three atoms, hence the factor 3
+                if (vflag_global)
+                  for (m = 0; m < 6; m++) ev.v[m] += 3.0 * v[m];
+
+                if (vflag_atom) {
+                  for (m = 0; m < 6; m++) {
+                    a_vatom(i, m) += v[m];
+                    a_vatom(j, m) += v[m];
+                    a_vatom(k, m) += v[m];
+                  }
+                }
+              }
+            }
+          }
+          // end of k loop
+        }
+      }
+    }
+    // end of j loop
+  }
+}
--- a/src/KOKKOS/meam_funcs_kokkos.h
+++ b/src/KOKKOS/meam_funcs_kokkos.h
@ -0,0 +1,289 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Naga Vydyanathan (NVIDIA)
+------------------------------------------------------------------------- */
+
+#include "math_special_kokkos.h"
+#include <cmath>
+#include "meam_kokkos.h"
+using namespace MathSpecialKokkos;
+
+//-----------------------------------------------------------------------------
+// Compute G(gamma) based on selection flag ibar:
+//  0 => G = sqrt(1+gamma)
+//  1 => G = exp(gamma/2)
+//  2 => not implemented
+//  3 => G = 2/(1+exp(-gamma))
+//  4 => G = sqrt(1+gamma)
+// -5 => G = +-sqrt(abs(1+gamma))
+// For ibar 0 and 4 a smoothed form replaces sqrt(1+gamma) below the switch
+// point -gsmooth_factor/(gsmooth_factor+1).
+// Any other ibar sets errorflag to 1 and returns 0; errorflag is left
+// untouched on success.
+//
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double MEAMKokkos<DeviceType>::G_gam(const double gamma, const int ibar, int &errorflag) const
+{
+  if (ibar == 0 || ibar == 4) {
+    const double switchpoint = -gsmooth_factor / (gsmooth_factor + 1);
+    if (gamma < switchpoint) {
+      // e.g. with gsmooth_factor = 99 the switch point is -0.99 and
+      // G^2 = 0.01*(-0.99/gamma)**99
+      const double Gsq = 1 / (gsmooth_factor + 1) * pow((switchpoint / gamma), gsmooth_factor);
+      return sqrt(Gsq);
+    }
+    return sqrt(1.0 + gamma);
+  }
+
+  if (ibar == 1) return MathSpecialKokkos::fm_exp(gamma / 2.0);
+
+  if (ibar == 3) return 2.0 / (1.0 + MathSpecialKokkos::fm_exp(-gamma));
+
+  if (ibar == -5) {
+    // odd continuation of the square root for 1 + gamma < 0
+    if ((1.0 + gamma) >= 0) return sqrt(1.0 + gamma);
+    return -sqrt(-1.0 - gamma);
+  }
+
+  // unsupported selection flag
+  errorflag = 1;
+  return 0.0;
+}
+
+//-----------------------------------------------------------------------------
+// Compute G(gamma) and dG(gamma) based on selection flag ibar:
+//  0 => G = sqrt(1+gamma)
+//  1 => G = exp(gamma/2)
+//  2 => not implemented
+//  3 => G = 2/(1+exp(-gamma))
+//  4 => G = sqrt(1+gamma)
+// -5 => G = +-sqrt(abs(1+gamma))
+// Returns G and stores the derivative in dG.
+// Any other ibar returns 0 with dG set to 1.
+//
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double MEAMKokkos<DeviceType>::dG_gam(const double gamma, const int ibar, double& dG) const
+{
+  if (ibar == 0 || ibar == 4) {
+    const double switchpoint = -gsmooth_factor / (gsmooth_factor + 1);
+    if (gamma < switchpoint) {
+      // e.g. with gsmooth_factor = 99 the switch point is -0.99 and
+      // G^2 = 0.01*(-0.99/gamma)**99
+      const double Gsq = 1 / (gsmooth_factor + 1) * pow((switchpoint / gamma), gsmooth_factor);
+      const double Gsmooth = sqrt(Gsq);
+      dG = -gsmooth_factor * Gsmooth / (2.0 * gamma);
+      return Gsmooth;
+    }
+    const double Groot = sqrt(1.0 + gamma);
+    dG = 1.0 / (2.0 * Groot);
+    return Groot;
+  }
+
+  if (ibar == 1) {
+    const double Gexp = MathSpecialKokkos::fm_exp(gamma / 2.0);
+    dG = Gexp / 2.0;
+    return Gexp;
+  }
+
+  if (ibar == 3) {
+    const double Gsig = 2.0 / (1.0 + MathSpecialKokkos::fm_exp(-gamma));
+    dG = Gsig * (2.0 - Gsig) / 2;
+    return Gsig;
+  }
+
+  if (ibar == -5) {
+    if ((1.0 + gamma) >= 0) {
+      const double Gpos = sqrt(1.0 + gamma);
+      dG = 1.0 / (2.0 * Gpos);
+      return Gpos;
+    }
+    // negative branch: G < 0, so -1/(2G) is positive
+    const double Gneg = -sqrt(-1.0 - gamma);
+    dG = -1.0 / (2.0 * Gneg);
+    return Gneg;
+  }
+
+  // unsupported selection flag
+  dG = 1.0;
+  return 0.0;
+}
+
+//-----------------------------------------------------------------------------
+// Compute ZBL potential
+//   r      : separation
+//   z1, z2 : the two integer charge numbers
+// Sums four exponential terms in x = r/a with a = azero/(z1^0.23 + z2^0.23).
+// For r > 0 the sum is multiplied by z1*z2/r*cc; for r <= 0 the bare sum is
+// returned.
+//
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double MEAMKokkos<DeviceType>::zbl(const double r, const int z1, const int z2) const
+{
+  const double coef[] = { 0.028171, 0.28022, 0.50986, 0.18175 };
+  const double expo[] = { 0.20162, 0.40290, 0.94229, 3.1998 };
+  // azero = (9pi^2/128)^1/3 (0.529) Angstroms
+  const double azero = 0.4685;
+  const double cc = 14.3997;
+
+  const double a = azero / (pow(z1, 0.23) + pow(z2, 0.23));
+  const double x = r / a;
+
+  double phi = 0.0;
+  for (int term = 0; term < 4; ++term)
+    phi += coef[term] * MathSpecialKokkos::fm_exp(-expo[term] * x);
+
+  if (r > 0.0) phi = phi * z1 * z2 / r * cc;
+  return phi;
+}
+
+//-----------------------------------------------------------------------------
+// Compute embedding function F(rhobar) and derivative F'(rhobar), eqn I.5
+//   rhobar > 0                     : F = A*Ec*rhobar*ln(rhobar), dF = A*Ec*(1 + ln(rhobar))
+//   rhobar <= 0, emb_lin_neg == 0  : F = 0,                      dF = 0
+//   rhobar <= 0, emb_lin_neg != 0  : F = -A*Ec*rhobar,           dF = -A*Ec
+// Returns F and stores the derivative in dF.
+//
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double MEAMKokkos<DeviceType>::embedding(const double A, const double Ec, const double rhobar, double& dF) const
+{
+  const double AEc = A * Ec;
+
+  if (rhobar > 0.0) {
+    const double lnrho = log(rhobar);
+    dF = AEc * (1.0 + lnrho);
+    return AEc * rhobar * lnrho;
+  }
+
+  // non-positive density: either no embedding energy or a linear continuation
+  if (emb_lin_neg == 0) {
+    dF = 0.0;
+    return 0.0;
+  }
+
+  dF = - AEc;
+  return - AEc * rhobar;
+}
+
+//-----------------------------------------------------------------------------
+// Compute Rose energy function, I.16
+//   astar = alpha*(r/re - 1); the cubic coefficient a3 is attrac for
+//   astar >= 0 and repuls for astar < 0.
+//   form 1     : cubic coefficient (-attrac + repuls/r)
+//   form 2     : cubic coefficient a3
+//   otherwise  : cubic coefficient a3/(r/re)
+// Returns 0 unless r > 0.
+//
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double MEAMKokkos<DeviceType>::erose(const double r, const double re, const double alpha, const double Ec, const double repuls,
+            const double attrac, const int form) const
+{
+  if (!(r > 0.0)) return 0.0;
+
+  const double astar = alpha * (r / re - 1.0);
+
+  // a3 stays 0 only if neither comparison holds
+  double a3 = 0.0;
+  if (astar >= 0) {
+    a3 = attrac;
+  } else if (astar < 0) {
+    a3 = repuls;
+  }
+
+  switch (form) {
+    case 1:
+      return -Ec * (1 + astar + (-attrac + repuls / r) * MathSpecialKokkos::cube(astar)) * MathSpecialKokkos::fm_exp(-astar);
+    case 2:
+      return -Ec * (1 + astar + a3 * MathSpecialKokkos::cube(astar)) * MathSpecialKokkos::fm_exp(-astar);
+    default:
+      return -Ec * (1 + astar + a3 * MathSpecialKokkos::cube(astar) / (r / re)) * MathSpecialKokkos::fm_exp(-astar);
+  }
+}
+
+//-----------------------------------------------------------------------------
+// Shape factors for various configurations
+//   latt       : reference lattice
+//   sthe, cthe : angle terms, only read for the ZIG and TRI lattices
+//   s          : output, the three shape factors
+// All three entries of s are written for every lattice value; lattices
+// without an explicit case get zeros, so callers never read indeterminate
+// values.
+//
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void MEAMKokkos<DeviceType>::get_shpfcn(const lattice_t latt, const double sthe, const double cthe, double (&s)[3]) const
+{
+  switch (latt) {
+    case FCC:
+    case BCC:
+    case B1:
+    case B2:
+      s[0] = 0.0;
+      s[1] = 0.0;
+      s[2] = 0.0;
+      break;
+    case HCP:
+      s[0] = 0.0;
+      s[1] = 0.0;
+      s[2] = 1.0 / 3.0;
+      break;
+    case CH4: // CH4 actually needs shape factor for diamond for C, dimer for H
+    case DIA:
+    case DIA3:
+      s[0] = 0.0;
+      s[1] = 0.0;
+      s[2] = 32.0 / 9.0;
+      break;
+    case DIM:
+      s[0] = 1.0;
+      s[1] = 2.0 / 3.0;
+      // s(4) = 1.d0 // this should be 0.4 unless (1-legendre) is multiplied in the density calc.
+      s[2] = 0.40; // this is (1-legendre) where legendre = 0.6 in dynamo is accounted.
+      break;
+    case LIN: // linear, theta being 180
+      s[0] = 0.0;
+      s[1] = 8.0 / 3.0; // 4*(co**4 + si**4 - 1.0/3.0) in zig become 4*(1-1/3)
+      s[2] = 0.0;
+      break;
+    case ZIG: //zig-zag
+    case TRI: //trimer e.g. H2O
+      s[0] = 4.0*pow(cthe,2);
+      s[1] = 4.0*(pow(cthe,4) + pow(sthe,4) - 1.0/3.0);
+      s[2] = 4.0*(pow(cthe,2) * (3*pow(sthe,4) + pow(cthe,4)));
+      s[2] = s[2] - 0.6*s[0]; //legend in dyn, 0.6 is default value.
+      break;
+    default:
+      // Lattice not defined in get_shpfcn: zero every component rather than
+      // only s[0], because callers read all three.
+      s[0] = 0.0;
+      s[1] = 0.0;
+      s[2] = 0.0;
+      break;
+  }
+}
+
+//-----------------------------------------------------------------------------
+// Number of neighbors for the reference structure
+// Returns 0 for a lattice value without an explicit case.
+//
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+int MEAMKokkos<DeviceType>::get_Zij(const lattice_t latt) const
+{
+  int zij = 0;    // stays 0 when the lattice is not listed below
+
+  switch (latt) {
+    case FCC:
+    case HCP:
+    case L12:
+      zij = 12;
+      break;
+    case C11:
+      zij = 10;
+      break;
+    case BCC:
+    case B2:
+      zij = 8;
+      break;
+    case B1:
+      zij = 6;
+      break;
+    case DIA:
+    case DIA3:
+    case CH4: // DYNAMO currently implemented this way while it needs two Z values, 4 and 1
+      zij = 4;
+      break;
+    case LIN:
+    case ZIG:
+    case TRI:
+      zij = 2;
+      break;
+    case DIM:
+      zij = 1;
+      break;
+  }
+
+  return zij;
+}
--- a/src/KOKKOS/meam_impl_kokkos.h
+++ b/src/KOKKOS/meam_impl_kokkos.h
@ -0,0 +1,68 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Naga Vydyanathan (NVIDIA), Stan Moore (SNL)
+------------------------------------------------------------------------- */
+
+#include "memory_kokkos.h"
+#include "meam_kokkos.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+MEAMKokkos<DeviceType>::MEAMKokkos(Memory *mem)
+  : MEAM(mem),
+    d_errorflag("meam:errorflag")    // scalar int view created with this label at construction
+{}
+
+template<class DeviceType>
+MEAMKokkos<DeviceType>::~MEAMKokkos()
+{
+  // nothing is released while copymode is set
+  if (copymode) return;
+  auto *memKK = static_cast<MemoryKokkos *>(memory);
+  // 1d arrays: each dual view is destroyed together with its like-named raw pointer
+  memKK->destroy_kokkos(k_rho,rho);
+  memKK->destroy_kokkos(k_rho0,rho0);
+  memKK->destroy_kokkos(k_rho1,rho1);
+  memKK->destroy_kokkos(k_rho2,rho2);
+  memKK->destroy_kokkos(k_rho3,rho3);
+  memKK->destroy_kokkos(k_frhop,frhop);
+  memKK->destroy_kokkos(k_gamma,gamma);
+  memKK->destroy_kokkos(k_dgamma1,dgamma1);
+  memKK->destroy_kokkos(k_dgamma2,dgamma2);
+  memKK->destroy_kokkos(k_dgamma3,dgamma3);
+  memKK->destroy_kokkos(k_arho2b,arho2b);
+  // 2d arrays
+  memKK->destroy_kokkos(k_arho1,arho1);
+  memKK->destroy_kokkos(k_arho2,arho2);
+  memKK->destroy_kokkos(k_arho3,arho3);
+  memKK->destroy_kokkos(k_arho3b,arho3b);
+  memKK->destroy_kokkos(k_t_ave,t_ave);
+  memKK->destroy_kokkos(k_tsq_ave,tsq_ave);
+  // remaining 1d arrays: scrfcn, dscrfcn, fcpair
+  memKK->destroy_kokkos(k_scrfcn,scrfcn);
+  memKK->destroy_kokkos(k_dscrfcn,dscrfcn);
+  memKK->destroy_kokkos(k_fcpair,fcpair);
+}
+
+#include "meam_setup_done_kokkos.h"
+#include "meam_funcs_kokkos.h"
+#include "meam_dens_init_kokkos.h"
+#include "meam_dens_final_kokkos.h"
+#include "meam_force_kokkos.h"
+
--- a/src/KOKKOS/meam_kokkos.h
+++ b/src/KOKKOS/meam_kokkos.h
@ -0,0 +1,224 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_MEAMKOKKOS_H
+#define LMP_MEAMKOKKOS_H
+
+#include "kokkos.h"
+#include "meam.h"
+#include "memory_kokkos.h"
+#include "neigh_request.h"
+#include "neighbor_kokkos.h"
+#include <cmath>
+#include <cstdlib>
+
+namespace LAMMPS_NS {
+
+struct TagMEAMDensFinal {};    // empty tag types: each one selects an operator() overload of MEAMKokkos
+template <int NEIGHFLAG>
+struct TagMEAMDensInit {};
+struct TagMEAMZero {};
+template <int NEIGHFLAG>
+struct TagMEAMForce {};
+
+template <class DeviceType> class MEAMKokkos : public MEAM {    // MEAM subclass whose view types come from ArrayTypes<DeviceType>
+ public:
+  typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;    // presumably the reduction type Kokkos looks up on a functor — confirm
+  MEAMKokkos(Memory *mem);
+  ~MEAMKokkos() override;
+  // tagged call operators; the two that take EV_FLOAT & carry an extra accumulator argument
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagMEAMDensFinal, const int &, EV_FLOAT &) const;
+
+  template <int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION void operator()(TagMEAMDensInit<NEIGHFLAG>, const int &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagMEAMZero, const int &) const;
+
+  template <int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION void operator()(TagMEAMForce<NEIGHFLAG>, const int &, EV_FLOAT &) const;
+
+ private:
+  // parameters to meam_dens_init
+
+  int ntype, nlocal;
+  typename AT::t_int_1d type;
+  typename AT::t_int_1d d_offset;
+  typename AT::t_int_1d d_map;
+  typename AT::t_int_2d d_scale;
+  typename AT::t_x_array x;
+  typename AT::t_int_1d d_numneigh_half;
+  typename AT::t_int_1d d_numneigh_full;
+  typename AT::t_neighbors_2d d_neighbors_half;
+  typename AT::t_neighbors_2d d_neighbors_full;
+  typename AT::t_int_1d d_ilist_half;
+  typename AT::t_f_array f;
+  typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
+
+  // parameters to meam_dens_final
+
+  typename AT::t_int_scalar d_errorflag;    // allocated in the constructor (label "meam:errorflag")
+  int eflag_either, eflag_global, eflag_atom, vflag_either, vflag_global, vflag_atom;
+  typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
+
+ public:
+  void meam_dens_setup(int, int, int) override;
+  void meam_setup_done(double *) override;    // also (re)allocates and fills the d_phir* views
+  void meam_dens_init(int, int, typename AT::t_int_1d, typename AT::t_int_1d,
+                      typename AT::t_x_array, typename AT::t_int_1d, typename AT::t_int_1d,
+                      typename AT::t_int_1d, typename AT::t_neighbors_2d,
+                      typename AT::t_neighbors_2d, typename AT::t_int_1d, int, int);
+  void meam_dens_final(int, int, int, int, typename ArrayTypes<DeviceType>::t_efloat_1d, int,
+                       typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_int_2d, int &,
+                       EV_FLOAT &);
+  void meam_force(int, int, int, int, int, typename ArrayTypes<DeviceType>::t_efloat_1d, int,
+                  typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_x_array,
+                  typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_f_array,
+                  typename ArrayTypes<DeviceType>::t_virial_array, typename AT::t_int_1d,
+                  typename AT::t_int_1d, typename AT::t_neighbors_2d, typename AT::t_neighbors_2d,
+                  int, int, EV_FLOAT &);
+  template <int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION void getscreen(int, int, typename AT::t_x_array, typename AT::t_int_1d,
+                                        typename AT::t_int_1d, int, typename AT::t_int_1d,
+                                        typename AT::t_int_1d) const;
+  template <int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION void calc_rho1(int, int, typename AT::t_int_1d, typename AT::t_int_1d,
+                                        typename AT::t_x_array, typename AT::t_int_1d, int) const;
+  KOKKOS_INLINE_FUNCTION
+  double fcut(const double xi) const;
+  KOKKOS_INLINE_FUNCTION
+  double dfcut(const double xi, double &dfc) const;
+  KOKKOS_INLINE_FUNCTION
+  double dCfunc(const double, const double, const double) const;
+  KOKKOS_INLINE_FUNCTION
+  void dCfunc2(const double, const double, const double, double &, double &) const;
+  KOKKOS_INLINE_FUNCTION
+  double G_gam(const double, const int, int &) const;
+  KOKKOS_INLINE_FUNCTION
+  double dG_gam(const double, const int, double &) const;
+  KOKKOS_INLINE_FUNCTION
+  double zbl(const double, const int, const int) const;
+  KOKKOS_INLINE_FUNCTION
+  double embedding(const double, const double, const double, double &) const;
+  KOKKOS_INLINE_FUNCTION
+  double erose(const double, const double, const double, const double, const double, const double,
+               const int) const;
+  KOKKOS_INLINE_FUNCTION
+  void get_shpfcn(const lattice_t latt, const double sthe, const double cthe, double (&s)[3]) const;
+  KOKKOS_INLINE_FUNCTION
+  int get_Zij(const lattice_t) const;
+
+ public:
+  DAT::tdual_ffloat_1d k_rho, k_rho0, k_rho1, k_rho2, k_rho3, k_frhop;    // k_*: dual views, released in ~MEAMKokkos()
+  typename ArrayTypes<DeviceType>::t_ffloat_1d d_rho, d_rho0, d_rho1, d_rho2, d_rho3, d_frhop;
+  HAT::t_ffloat_1d h_rho, h_rho0, h_rho1, h_rho2, h_rho3, h_frhop;
+  DAT::tdual_ffloat_1d k_gamma, k_dgamma1, k_dgamma2, k_dgamma3, k_arho2b;
+  typename ArrayTypes<DeviceType>::t_ffloat_1d d_gamma, d_dgamma1, d_dgamma2, d_dgamma3, d_arho2b;
+  HAT::t_ffloat_1d h_gamma, h_dgamma1, h_dgamma2, h_dgamma3, h_arho2b;
+  DAT::tdual_ffloat_2d k_arho1, k_arho2, k_arho3, k_arho3b, k_t_ave, k_tsq_ave;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_arho1, d_arho2, d_arho3, d_arho3b, d_t_ave,
+      d_tsq_ave;
+  HAT::t_ffloat_2d h_arho1, h_arho2, h_arho3, h_arho3b, h_t_ave, h_tsq_ave;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_phir, d_phirar, d_phirar1, d_phirar2, d_phirar3,
+      d_phirar4, d_phirar5, d_phirar6;    // sized and filled in meam_setup_done()
+  DAT::tdual_ffloat_1d k_scrfcn, k_dscrfcn, k_fcpair;
+  typename ArrayTypes<DeviceType>::t_ffloat_1d d_scrfcn, d_dscrfcn, d_fcpair;
+  HAT::t_ffloat_1d h_scrfcn, h_dscrfcn, h_fcpair;
+
+ protected:
+  int need_dup;
+  using KKDeviceType = typename KKDevice<DeviceType>::value;
+  // scatter-view aliases: both use KKScatterSum and differ only in the duplication policy
+  template <typename DataType, typename Layout>
+  using DupScatterView =
+      KKScatterView<DataType, Layout, KKDeviceType, KKScatterSum, KKScatterDuplicated>;
+
+  template <typename DataType, typename Layout>
+  using NonDupScatterView =
+      KKScatterView<DataType, Layout, KKDeviceType, KKScatterSum, KKScatterNonDuplicated>;
+  // one dup_/ndup_ pair per target view; data type and layout are taken from that view via decltype
+  DupScatterView<typename decltype(d_rho0)::data_type, typename decltype(d_rho0)::array_layout>
+      dup_rho0;
+  NonDupScatterView<typename decltype(d_rho0)::data_type, typename decltype(d_rho0)::array_layout>
+      ndup_rho0;
+  DupScatterView<typename decltype(d_arho2b)::data_type, typename decltype(d_arho2b)::array_layout>
+      dup_arho2b;
+  NonDupScatterView<typename decltype(d_arho2b)::data_type,
+                    typename decltype(d_arho2b)::array_layout>
+      ndup_arho2b;
+  DupScatterView<typename decltype(d_arho1)::data_type, typename decltype(d_arho1)::array_layout>
+      dup_arho1;
+  NonDupScatterView<typename decltype(d_arho1)::data_type, typename decltype(d_arho1)::array_layout>
+      ndup_arho1;
+  DupScatterView<typename decltype(d_arho2)::data_type, typename decltype(d_arho2)::array_layout>
+      dup_arho2;
+  NonDupScatterView<typename decltype(d_arho2)::data_type, typename decltype(d_arho2)::array_layout>
+      ndup_arho2;
+  DupScatterView<typename decltype(d_arho3)::data_type, typename decltype(d_arho3)::array_layout>
+      dup_arho3;
+  NonDupScatterView<typename decltype(d_arho3)::data_type, typename decltype(d_arho3)::array_layout>
+      ndup_arho3;
+  DupScatterView<typename decltype(d_arho3b)::data_type, typename decltype(d_arho3b)::array_layout>
+      dup_arho3b;
+  NonDupScatterView<typename decltype(d_arho3b)::data_type,
+                    typename decltype(d_arho3b)::array_layout>
+      ndup_arho3b;
+  DupScatterView<typename decltype(d_t_ave)::data_type, typename decltype(d_t_ave)::array_layout>
+      dup_t_ave;
+  NonDupScatterView<typename decltype(d_t_ave)::data_type, typename decltype(d_t_ave)::array_layout>
+      ndup_t_ave;
+  DupScatterView<typename decltype(d_tsq_ave)::data_type,
+                 typename decltype(d_tsq_ave)::array_layout>
+      dup_tsq_ave;
+  NonDupScatterView<typename decltype(d_tsq_ave)::data_type,
+                    typename decltype(d_tsq_ave)::array_layout>
+      ndup_tsq_ave;
+  DupScatterView<typename decltype(f)::data_type, typename decltype(f)::array_layout> dup_f;
+  NonDupScatterView<typename decltype(f)::data_type, typename decltype(f)::array_layout> ndup_f;
+  DupScatterView<typename decltype(d_eatom)::data_type, typename decltype(d_eatom)::array_layout>
+      dup_eatom;
+  NonDupScatterView<typename decltype(d_eatom)::data_type, typename decltype(d_eatom)::array_layout>
+      ndup_eatom;
+  DupScatterView<typename decltype(d_vatom)::data_type, typename decltype(d_vatom)::array_layout>
+      dup_vatom;
+  NonDupScatterView<typename decltype(d_vatom)::data_type, typename decltype(d_vatom)::array_layout>
+      ndup_vatom;
+};
+
+KOKKOS_INLINE_FUNCTION static bool iszero_kk(const double f)
+{
+  const double magnitude = fabs(f);
+  return magnitude < 1e-20;    // "zero" means |f| strictly below 1e-20
+}
+
+KOKKOS_INLINE_FUNCTION static bool isone_kk(const double f)
+{
+  const double offset = f - 1.0;
+  return fabs(offset) < 1e-20;    // "one" means |f - 1| strictly below 1e-20
+}
+
+KOKKOS_INLINE_FUNCTION static double fdiv_zero_kk(const double n, const double d)
+{
+  // quotient n/d, defined as 0.0 when the divisor passes iszero_kk()
+  const bool divisor_is_zero = iszero_kk(d);
+  return divisor_is_zero ? 0.0 : n / d;
+}
+
+// Functions we need for compat
+
+}    // namespace LAMMPS_NS
+#include "meam_impl_kokkos.h"
+
+#endif
--- a/src/KOKKOS/meam_setup_done_kokkos.h
+++ b/src/KOKKOS/meam_setup_done_kokkos.h
@ -0,0 +1,60 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "meam_kokkos.h"
+
+template<class DeviceType>
+void MEAMKokkos<DeviceType>::meam_setup_done(double* cutmax)
+{
+  MEAM::meam_setup_done(cutmax);    // base-class setup runs first; its phir*/phirar* tables are read below
+  const auto npair = (neltypes * (neltypes + 1)) / 2;    // row count shared by all eight tables
+  MemKK::realloc_kokkos(d_phir, "pair:phir", npair, nr);
+  MemKK::realloc_kokkos(d_phirar, "pair:phirar", npair, nr);
+  MemKK::realloc_kokkos(d_phirar1, "pair:phirar1", npair, nr);
+  MemKK::realloc_kokkos(d_phirar2, "pair:phirar2", npair, nr);
+  MemKK::realloc_kokkos(d_phirar3, "pair:phirar3", npair, nr);
+  MemKK::realloc_kokkos(d_phirar4, "pair:phirar4", npair, nr);
+  MemKK::realloc_kokkos(d_phirar5, "pair:phirar5", npair, nr);
+  MemKK::realloc_kokkos(d_phirar6, "pair:phirar6", npair, nr);
+  // mirror views used to stage the values before the deep copies
+  auto h_phir = Kokkos::create_mirror_view(d_phir);
+  auto h_phirar = Kokkos::create_mirror_view(d_phirar);
+  auto h_phirar1 = Kokkos::create_mirror_view(d_phirar1);
+  auto h_phirar2 = Kokkos::create_mirror_view(d_phirar2);
+  auto h_phirar3 = Kokkos::create_mirror_view(d_phirar3);
+  auto h_phirar4 = Kokkos::create_mirror_view(d_phirar4);
+  auto h_phirar5 = Kokkos::create_mirror_view(d_phirar5);
+  auto h_phirar6 = Kokkos::create_mirror_view(d_phirar6);
+  // element-wise copy of the npair x nr tables into the mirrors
+  for (int ipair = 0; ipair < npair; ipair++)
+    for (int ir = 0; ir < nr; ir++) {
+      h_phir(ipair,ir) = phir[ipair][ir];
+      h_phirar(ipair,ir) = phirar[ipair][ir];
+      h_phirar1(ipair,ir) = phirar1[ipair][ir];
+      h_phirar2(ipair,ir) = phirar2[ipair][ir];
+      h_phirar3(ipair,ir) = phirar3[ipair][ir];
+      h_phirar4(ipair,ir) = phirar4[ipair][ir];
+      h_phirar5(ipair,ir) = phirar5[ipair][ir];
+      h_phirar6(ipair,ir) = phirar6[ipair][ir];
+    }
+  // transfer the staged values into the d_* views
+  Kokkos::deep_copy(d_phir,h_phir);
+  Kokkos::deep_copy(d_phirar,h_phirar);
+  Kokkos::deep_copy(d_phirar1,h_phirar1);
+  Kokkos::deep_copy(d_phirar2,h_phirar2);
+  Kokkos::deep_copy(d_phirar3,h_phirar3);
+  Kokkos::deep_copy(d_phirar4,h_phirar4);
+  Kokkos::deep_copy(d_phirar5,h_phirar5);
+  Kokkos::deep_copy(d_phirar6,h_phirar6);
+}
--- a/src/KOKKOS/min_cg_kokkos.cpp
+++ b/src/KOKKOS/min_cg_kokkos.cpp
@ -49,6 +49,8 @@ int MinCGKokkos::iterate(int maxiter)
  fix_minimize_kk->k_vectors.sync<LMPDeviceType>();
  fix_minimize_kk->k_vectors.modify<LMPDeviceType>();

+  atomKK->sync(Device,F_MASK);
+
  // nlimit = max # of CG iterations before restarting
  // set to ndoftotal unless too big

--- a/src/KOKKOS/min_kokkos.cpp
+++ b/src/KOKKOS/min_kokkos.cpp
@ -79,6 +79,8 @@ void MinKokkos::setup(int flag)
  }
  update->setupflag = 1;

+  lmp->kokkos->auto_sync = 1;
+
  // setup extra global dof due to fixes
  // cannot be done in init() b/c update init() is before modify init()

@ -170,7 +172,7 @@ void MinKokkos::setup(int flag)
  }
  else if (force->pair) force->pair->compute_dummy(eflag,vflag);

-  if (atomKK->molecular) {
+  if (atom->molecular != Atom::ATOMIC) {
    if (force->bond) {
      atomKK->sync(force->bond->execution_space,force->bond->datamask_read);
      force->bond->compute(eflag,vflag);
@ -242,6 +244,8 @@ void MinKokkos::setup_minimal(int flag)
  // acquire ghosts
  // build neighbor lists

+  lmp->kokkos->auto_sync = 1;
+
  if (flag) {
    modify->setup_pre_exchange();
    if (triclinic) domain->x2lamda(atom->nlocal);
@ -277,7 +281,7 @@ void MinKokkos::setup_minimal(int flag)
  }
  else if (force->pair) force->pair->compute_dummy(eflag,vflag);

-  if (atomKK->molecular) {
+  if (atom->molecular != Atom::ATOMIC) {
    if (force->bond) {
      atomKK->sync(force->bond->execution_space,force->bond->datamask_read);
      force->bond->compute(eflag,vflag);
@ -495,6 +499,7 @@ double MinKokkos::energy_force(int resetflag)
  if (force->newton) {
    comm->reverse_comm();
    timer->stamp(Timer::COMM);
+    atomKK->sync(Device,F_MASK);
  }

  // update per-atom minimization variables stored by pair styles
@ -567,7 +572,7 @@ void MinKokkos::force_clear()
      }
    });
  }
-  atomKK->modified(Device,F_MASK);
+  atomKK->modified(Device,F_MASK|TORQUE_MASK);
 }

 /* ----------------------------------------------------------------------
@ -576,6 +581,7 @@ void MinKokkos::force_clear()

 double MinKokkos::fnorm_sqr()
 {
+  atomKK->sync(Device,F_MASK);

  double local_norm2_sqr = 0.0;
  {
@ -604,6 +610,7 @@ double MinKokkos::fnorm_sqr()

 double MinKokkos::fnorm_inf()
 {
+  atomKK->sync(Device,F_MASK);

  double local_norm_inf = 0.0;
  {
@ -632,6 +639,7 @@ double MinKokkos::fnorm_inf()

 double MinKokkos::fnorm_max()
 {
+  atomKK->sync(Device,F_MASK);

  double local_norm_max = 0.0;
  {
--- a/src/KOKKOS/min_linesearch_kokkos.cpp
+++ b/src/KOKKOS/min_linesearch_kokkos.cpp
@ -111,9 +111,6 @@ void MinLineSearchKokkos::reset_vectors()
  x0 = fix_minimize_kk->request_vector_kokkos(0);
  g = fix_minimize_kk->request_vector_kokkos(1);
  h = fix_minimize_kk->request_vector_kokkos(2);
-
-  auto h_fvec = Kokkos::create_mirror_view(fvec);
-  Kokkos::deep_copy(h_fvec,fvec);
 }

 /* ----------------------------------------------------------------------
@ -181,6 +178,8 @@ int MinLineSearchKokkos::linemin_quadratic(double eoriginal, double &alpha)
  fix_minimize_kk->k_vectors.sync<LMPDeviceType>();
  fix_minimize_kk->k_vectors.modify<LMPDeviceType>();

+  atomKK->sync(Device,X_MASK|F_MASK);
+
  // fdothall = projection of search dir along downhill gradient
  // if search direction is not downhill, exit with error

@ -364,8 +363,8 @@ double MinLineSearchKokkos::alpha_step(double alpha, int resetflag)
  // reset to starting point

  if (nextra_global) modify->min_step(0.0,hextra);
-  atomKK->k_x.clear_sync_state(); // ignore if host positions since device
-                                  //  positions will be reset below
+  atomKK->k_x.clear_sync_state(); // ignore if host positions modified since
+                                  //  device positions will be reset below
  {
    // local variables for lambda capture

@ -409,6 +408,8 @@ double MinLineSearchKokkos::compute_dir_deriv(double &ff)
  double dot[2],dotall[2];
  double fh;

+  atomKK->sync(Device,F_MASK);
+
  // compute new fh, alpha, delfh

  s_double2 sdot;
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@ -153,6 +153,9 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI,SIZE>::build(NeighList *list_)
  int nall = nlocal;
  if (GHOST)
    nall += atom->nghost;
+
+  if (nall == 0) return;
+
  list->grow(nall);

  NeighborKokkosExecute<DeviceType>
--- a/src/KOKKOS/pair_meam_kokkos.cpp
+++ b/src/KOKKOS/pair_meam_kokkos.cpp
@ -0,0 +1,753 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Naga Vydyanathan (NVIDIA), Stan Moore (SNL)
+------------------------------------------------------------------------- */
+
+#include "pair_meam_kokkos.h"
+#include "meam_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "kokkos.h"
+#include "memory_kokkos.h"
+#include "neigh_list_kokkos.h"
+#include "neigh_request.h"
+#include "neighbor.h"
+
+#include <cmath>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairMEAMKokkos<DeviceType>::PairMEAMKokkos(LAMMPS *lmp) : PairMEAM(lmp)
+{
+  // replace whatever MEAM object the base class left in meam_inst with the Kokkos implementation
+  delete meam_inst;
+  meam_inst_kk = new MEAMKokkos<DeviceType>(memory);
+  meam_inst = meam_inst_kk;    // both pointers refer to the same object from here on
+  respa_enable = 0;
+  kokkosable = 1;
+  reverse_comm_device = 1;
+  // atom container, execution space and data masks
+  atomKK = static_cast<AtomKokkos *>(atom);
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK;
+  datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairMEAMKokkos<DeviceType>::~PairMEAMKokkos()
+{
+  if (!copymode) {    // nothing is released while copymode is set
+    memoryKK->destroy_kokkos(k_eatom,eatom);
+    memoryKK->destroy_kokkos(k_vatom,vatom);
+    delete meam_inst_kk;
+    meam_inst = nullptr;    // the constructor made meam_inst alias meam_inst_kk; clear the stale pointer
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMEAMKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{    // sequence: meam_dens_init -> reverse_comm -> meam_dens_final -> forward_comm -> meam_force
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+
+  ev_init(eflag,vflag,0);
+
+  // recreate the per-atom energy/virial arrays (sized maxeatom/maxvatom) whenever the matching flag is set
+
+  if (eflag_atom) {
+    memoryKK->destroy_kokkos(k_eatom,eatom);
+    memoryKK->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memoryKK->destroy_kokkos(k_vatom,vatom);
+    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom");
+    d_vatom = k_vatom.view<DeviceType>();
+  }
+
+  // neighbor list info: views taken from the half and the full Kokkos neighbor lists
+
+  int inum_half = listhalf->inum;
+  NeighListKokkos<DeviceType>* k_halflist = static_cast<NeighListKokkos<DeviceType>*>(listhalf);
+  d_ilist_half = k_halflist->d_ilist;
+  d_numneigh_half = k_halflist->d_numneigh;
+  d_neighbors_half = k_halflist->d_neighbors;
+
+  NeighListKokkos<DeviceType>* k_fulllist = static_cast<NeighListKokkos<DeviceType>*>(listfull);
+  d_numneigh_full = k_fulllist->d_numneigh;
+  d_neighbors_full = k_fulllist->d_neighbors;
+
+  EV_FLOAT ev;
+
+  copymode = 1;    // stays set until the end of this function; both destructors return early while it is set
+  meam_inst_kk->copymode = 1;
+
+  // strip neighbor lists of any special bond flags before using with MEAM
+  // necessary before doing neigh_f2c and neigh_c2f conversions each step
+
+  if (neighbor->ago == 0)    // kernel is launched only when neighbor->ago is 0
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMEAMNeighStrip >(0,inum_half),*this);
+
+  // check size of scrfcn based on half neighbor list
+
+  nlocal = atom->nlocal;
+  nall = nlocal + atom->nghost;
+
+  int n = 0;
+  Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMEAMOffsets>(0,inum_half),*this,n);    // result n is handed to meam_dens_setup()
+
+  meam_inst_kk->meam_dens_setup(atom->nmax, nall, n);
+
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+
+  atomKK->sync(execution_space,datamask_read);
+
+  int ntype = atom->ntypes;
+
+  // 3 stages of MEAM calculation
+  // loop over my atoms followed by communication
+
+  int errorflag = 0;
+
+  d_offset = typename AT::t_int_1d("pair:offset",inum_half+1);    // the scan below writes entries 1..inum_half only
+  {
+    // local variables for lambda capture
+
+    auto l_ilist_half = d_ilist_half;
+    auto l_numneigh_half = d_numneigh_half;
+    auto l_offset = d_offset;
+    // NOTE(review): unlike the dispatches above, this scan is given no RangePolicy<DeviceType>, so it runs in the default execution space — confirm intended
+    Kokkos::parallel_scan(inum_half, LAMMPS_LAMBDA(int ii, int &m_fill, bool final) {
+      int i = l_ilist_half[ii];
+      m_fill += l_numneigh_half[i];
+      if (final)
+        l_offset[ii+1] = m_fill;    // running sum of half-list neighbor counts through entry ii
+    });
+  }
+
+  int need_dup = lmp->kokkos->need_dup<DeviceType>();
+
+  meam_inst_kk->meam_dens_init(inum_half,ntype,type,d_map,x,d_numneigh_half,d_numneigh_full,d_ilist_half,d_neighbors_half, d_neighbors_full, d_offset, neighflag, need_dup);
+
+  meam_inst_kk->k_rho0.template modify<DeviceType>();    // flag these dual views as modified in DeviceType's space before the reverse communication
+  meam_inst_kk->k_arho2b.template modify<DeviceType>();
+  meam_inst_kk->k_arho1.template modify<DeviceType>();
+  meam_inst_kk->k_arho2.template modify<DeviceType>();
+  meam_inst_kk->k_arho3.template modify<DeviceType>();
+  meam_inst_kk->k_arho3b.template modify<DeviceType>();
+  meam_inst_kk->k_t_ave.template modify<DeviceType>();
+  meam_inst_kk->k_tsq_ave.template modify<DeviceType>();
+
+  comm->reverse_comm(this);
+
+  meam_inst_kk->k_rho0.template sync<DeviceType>();    // sync the same eight views back to DeviceType's space
+  meam_inst_kk->k_arho2b.template sync<DeviceType>();
+  meam_inst_kk->k_arho1.template sync<DeviceType>();
+  meam_inst_kk->k_arho2.template sync<DeviceType>();
+  meam_inst_kk->k_arho3.template sync<DeviceType>();
+  meam_inst_kk->k_arho3b.template sync<DeviceType>();
+  meam_inst_kk->k_t_ave.template sync<DeviceType>();
+  meam_inst_kk->k_tsq_ave.template sync<DeviceType>();
+
+  meam_inst_kk->meam_dens_final(nlocal,eflag_either,eflag_global,eflag_atom,
+                   d_eatom,ntype,type,d_map,d_scale,errorflag,ev);
+
+  if (errorflag)    // errorflag is passed by reference to meam_dens_final() above
+    error->one(FLERR,"MEAM library error {}",errorflag);
+
+  meam_inst_kk->k_rho0.template modify<DeviceType>();    // sixteen views are flagged before the forward communication
+  meam_inst_kk->k_rho1.template modify<DeviceType>();
+  meam_inst_kk->k_rho2.template modify<DeviceType>();
+  meam_inst_kk->k_rho3.template modify<DeviceType>();
+  meam_inst_kk->k_frhop.template modify<DeviceType>();
+  meam_inst_kk->k_gamma.template modify<DeviceType>();
+  meam_inst_kk->k_dgamma1.template modify<DeviceType>();
+  meam_inst_kk->k_dgamma2.template modify<DeviceType>();
+  meam_inst_kk->k_dgamma3.template modify<DeviceType>();
+  meam_inst_kk->k_arho2b.template modify<DeviceType>();
+  meam_inst_kk->k_arho1.template modify<DeviceType>();
+  meam_inst_kk->k_arho2.template modify<DeviceType>();
+  meam_inst_kk->k_arho3.template modify<DeviceType>();
+  meam_inst_kk->k_arho3b.template modify<DeviceType>();
+  meam_inst_kk->k_t_ave.template modify<DeviceType>();
+  meam_inst_kk->k_tsq_ave.template modify<DeviceType>();
+
+  comm->forward_comm(this);    // pack_forward_comm_kokkos() sends 38 values per atom
+
+  meam_inst_kk->k_rho0.template sync<DeviceType>();
+  meam_inst_kk->k_rho1.template sync<DeviceType>();
+  meam_inst_kk->k_rho2.template sync<DeviceType>();
+  meam_inst_kk->k_rho3.template sync<DeviceType>();
+  meam_inst_kk->k_frhop.template sync<DeviceType>();
+  meam_inst_kk->k_gamma.template sync<DeviceType>();
+  meam_inst_kk->k_dgamma1.template sync<DeviceType>();
+  meam_inst_kk->k_dgamma2.template sync<DeviceType>();
+  meam_inst_kk->k_dgamma3.template sync<DeviceType>();
+  meam_inst_kk->k_arho2b.template sync<DeviceType>();
+  meam_inst_kk->k_arho1.template sync<DeviceType>();
+  meam_inst_kk->k_arho2.template sync<DeviceType>();
+  meam_inst_kk->k_arho3.template sync<DeviceType>();
+  meam_inst_kk->k_arho3b.template sync<DeviceType>();
+  meam_inst_kk->k_t_ave.template sync<DeviceType>();
+  meam_inst_kk->k_tsq_ave.template sync<DeviceType>();
+
+  meam_inst_kk->meam_force(inum_half,eflag_global,eflag_atom,vflag_global,
+                           vflag_atom,d_eatom,ntype,type,d_map,x,
+                           d_numneigh_half, d_numneigh_full,f,d_vatom,
+                           d_ilist_half, d_offset, d_neighbors_half, d_neighbors_full,
+                           neighflag, need_dup, ev);
+
+  if (eflag_global) eng_vdwl += ev.evdwl;    // ev was passed to both meam_dens_final() and meam_force()
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.sync_host();    // make the per-atom energies available on the host side
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.sync_host();
+  }
+
+  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
+  else atomKK->modified(execution_space,F_MASK);    // without energy/virial tallies only F_MASK is reported as modified
+
+  copymode = 0;
+  meam_inst_kk->copymode = 0;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+template<class DeviceType>
+void PairMEAMKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  // PairMEAM::coeff() handles the arguments; map[] and scale[][] are read only afterwards
+  PairMEAM::coeff(narg,arg);
+  const int ntypes = atom->ntypes;
+
+  // d_map <- map[1..ntypes] (view has ntypes+1 entries, index 0 is not assigned here)
+  MemKK::realloc_kokkos(d_map,"pair:map",ntypes+1);
+  auto h_map = Kokkos::create_mirror_view(d_map);
+  for (int itype = 1; itype <= ntypes; itype++)
+    h_map[itype] = map[itype];
+  Kokkos::deep_copy(d_map,h_map);
+
+  // d_scale <- scale[1..ntypes][1..ntypes]
+  MemKK::realloc_kokkos(d_scale,"pair:scale",ntypes+1,ntypes+1);
+  auto h_scale = Kokkos::create_mirror_view(d_scale);
+  for (int itype = 1; itype <= ntypes; itype++)
+    for (int jtype = 1; jtype <= ntypes; jtype++)
+      h_scale(itype,jtype) = scale[itype][jtype];
+  Kokkos::deep_copy(d_scale,h_scale);
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+template<class DeviceType>
+void PairMEAMKokkos<DeviceType>::init_style()
+{
+  PairMEAM::init_style();
+  neighflag = lmp->kokkos->neighflag;
+
+  // device flag: DeviceType is the device type; host flag: DeviceType is the
+  // host type and is not also the device type
+  const bool on_device = std::is_same<DeviceType,LMPDeviceType>::value;
+  const bool on_host = std::is_same<DeviceType,LMPHostType>::value && !on_device;
+
+  // apply the same Kokkos flags to neighbor requests 1 and 2 of this pair style
+  for (int id = 1; id <= 2; id++) {
+    auto request = neighbor->find_request(this,id);
+    request->set_kokkos_host(on_host);
+    request->set_kokkos_device(on_device);
+  }
+
+  if (neighflag == FULL)
+    error->all(FLERR,"Must use half neighbor list style with pair meam/kk");
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PairMEAMKokkos<DeviceType>::pack_forward_comm_kokkos(
+    int n, DAT::tdual_int_2d k_sendlist, int iswap_in, DAT::tdual_xfloat_1d &buf, int /*pbc_flag*/, int * /*pbc*/)
+{
+  iswap = iswap_in;    // kernel inputs are handed over through members read by the functor (*this)
+  v_buf = buf.view<DeviceType>();
+  d_sendlist = k_sendlist.view<DeviceType>();
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMEAMPackForwardComm>(0,n),*this);
+  return 38*n;    // the pack functor writes 38 buffer values per atom
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMPackForwardComm, const int &i) const {
+  // Writes 38 consecutive values for send-list entry i of swap iswap into v_buf,
+  // beginning at offset 38*i. Order: 10 scalars, then arho1 (3), arho2 (6),
+  // arho3 (10), arho3b (3), t_ave (3), tsq_ave (3).
+  // NOTE(review): meam_inst_kk is dereferenced inside the kernel body — confirm the
+  // pointed-to object is reachable from DeviceType's execution space.
+
+  const int j = d_sendlist(iswap, i);
+  const auto *src = meam_inst_kk;
+  int m = 38*i;
+
+  // scalars of atom j
+  v_buf[m++] = src->d_rho0[j];
+  v_buf[m++] = src->d_rho1[j];
+  v_buf[m++] = src->d_rho2[j];
+  v_buf[m++] = src->d_rho3[j];
+  v_buf[m++] = src->d_frhop[j];
+  v_buf[m++] = src->d_gamma[j];
+  v_buf[m++] = src->d_dgamma1[j];
+  v_buf[m++] = src->d_dgamma2[j];
+  v_buf[m++] = src->d_dgamma3[j];
+  v_buf[m++] = src->d_arho2b[j];
+
+  // 2d arrays: row j, columns in increasing order
+  for (int k = 0; k < 3; k++) v_buf[m++] = src->d_arho1(j,k);
+  for (int k = 0; k < 6; k++) v_buf[m++] = src->d_arho2(j,k);
+  for (int k = 0; k < 10; k++) v_buf[m++] = src->d_arho3(j,k);
+  for (int k = 0; k < 3; k++) v_buf[m++] = src->d_arho3b(j,k);
+
+  // last two blocks: t_ave and tsq_ave, three columns each
+  for (int k = 0; k < 3; k++) v_buf[m++] = src->d_t_ave(j,k);
+  for (int k = 0; k < 3; k++) v_buf[m++] = src->d_tsq_ave(j,k);
+}
+
+/* ----------------------------------------------------------------------
+   forward comm, Kokkos path: unpack 38 values per atom from buf into
+   the n consecutive atoms starting at index first_in
+   runs the TagPairMEAMUnpackForwardComm functor over [0,n)
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMEAMKokkos<DeviceType>::unpack_forward_comm_kokkos(int n, int first_in,
+                                                            DAT::tdual_xfloat_1d &buf)
+{
+  // per-launch state read by the functor
+  v_buf = buf.view<DeviceType>();
+  first = first_in;
+
+  typedef Kokkos::RangePolicy<DeviceType, TagPairMEAMUnpackForwardComm> policy_type;
+  Kokkos::parallel_for(policy_type(0,n),*this);
+}
+
+/* ----------------------------------------------------------------------
+   unpack functor for forward comm: the 38 values stored at
+   v_buf[38*i] ... v_buf[38*i+37] overwrite the per-atom MEAM values of
+   atom first+i, in the same order used by the forward pack functor
+   NOTE(review): the views are reached through the meam_inst_kk pointer
+   inside the kernel - confirm the pointer is dereferenceable in the
+   execution space of DeviceType
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMUnpackForwardComm, const int &i) const {
+  auto * const mk = meam_inst_kk;
+  const int a = first + i;
+  int m = 38*i;
+
+  // one value per atom
+  mk->d_rho0[a] = v_buf[m++];
+  mk->d_rho1[a] = v_buf[m++];
+  mk->d_rho2[a] = v_buf[m++];
+  mk->d_rho3[a] = v_buf[m++];
+  mk->d_frhop[a] = v_buf[m++];
+  mk->d_gamma[a] = v_buf[m++];
+  mk->d_dgamma1[a] = v_buf[m++];
+  mk->d_dgamma2[a] = v_buf[m++];
+  mk->d_dgamma3[a] = v_buf[m++];
+  mk->d_arho2b[a] = v_buf[m++];
+
+  // several components per atom, unpacked in component order
+  for (int k = 0; k < 3; k++) mk->d_arho1(a,k) = v_buf[m++];
+  for (int k = 0; k < 6; k++) mk->d_arho2(a,k) = v_buf[m++];
+  for (int k = 0; k < 10; k++) mk->d_arho3(a,k) = v_buf[m++];
+  for (int k = 0; k < 3; k++) mk->d_arho3b(a,k) = v_buf[m++];
+  for (int k = 0; k < 3; k++) mk->d_t_ave(a,k) = v_buf[m++];
+  for (int k = 0; k < 3; k++) mk->d_tsq_ave(a,k) = v_buf[m++];
+}
+
+/* ----------------------------------------------------------------------
+   forward comm, host path: pack 38 values for each of the n atoms in
+   list into buf, reading the host (h_*) views
+   every dual view that is read is synced to the host first
+   layout: rho0 rho1 rho2 rho3 frhop gamma dgamma1 dgamma2 dgamma3 arho2b
+           arho1[3] arho2[6] arho3[10] arho3b[3] t_ave[3] tsq_ave[3]
+   the pbc arguments are not used
+   returns the number of values packed
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PairMEAMKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf,
+                               int /*pbc_flag*/, int * /*pbc*/)
+{
+  auto * const mk = meam_inst_kk;
+
+  // bring the host copies up to date before reading them
+  mk->k_rho0.sync_host();
+  mk->k_rho1.sync_host();
+  mk->k_rho2.sync_host();
+  mk->k_rho3.sync_host();
+  mk->k_frhop.sync_host();
+  mk->k_gamma.sync_host();
+  mk->k_dgamma1.sync_host();
+  mk->k_dgamma2.sync_host();
+  mk->k_dgamma3.sync_host();
+  mk->k_arho2b.sync_host();
+  mk->k_arho1.sync_host();
+  mk->k_arho2.sync_host();
+  mk->k_arho3.sync_host();
+  mk->k_arho3b.sync_host();
+  mk->k_t_ave.sync_host();
+  mk->k_tsq_ave.sync_host();
+
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+
+    // one value per atom
+    buf[m++] = mk->h_rho0[j];
+    buf[m++] = mk->h_rho1[j];
+    buf[m++] = mk->h_rho2[j];
+    buf[m++] = mk->h_rho3[j];
+    buf[m++] = mk->h_frhop[j];
+    buf[m++] = mk->h_gamma[j];
+    buf[m++] = mk->h_dgamma1[j];
+    buf[m++] = mk->h_dgamma2[j];
+    buf[m++] = mk->h_dgamma3[j];
+    buf[m++] = mk->h_arho2b[j];
+
+    // several components per atom, packed in component order
+    for (int k = 0; k < 3; k++) buf[m++] = mk->h_arho1(j,k);
+    for (int k = 0; k < 6; k++) buf[m++] = mk->h_arho2(j,k);
+    for (int k = 0; k < 10; k++) buf[m++] = mk->h_arho3(j,k);
+    for (int k = 0; k < 3; k++) buf[m++] = mk->h_arho3b(j,k);
+    for (int k = 0; k < 3; k++) buf[m++] = mk->h_t_ave(j,k);
+    for (int k = 0; k < 3; k++) buf[m++] = mk->h_tsq_ave(j,k);
+  }
+
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   forward comm, host path: unpack 38 values per atom from buf into the
+   host (h_*) views of atoms first ... first+n-1
+   the dual views are synced to the host before the write and flagged
+   as modified on the host afterwards
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMEAMKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
+{
+  auto * const mk = meam_inst_kk;
+
+  // bring the host copies up to date before overwriting part of them
+  mk->k_rho0.sync_host();
+  mk->k_rho1.sync_host();
+  mk->k_rho2.sync_host();
+  mk->k_rho3.sync_host();
+  mk->k_frhop.sync_host();
+  mk->k_gamma.sync_host();
+  mk->k_dgamma1.sync_host();
+  mk->k_dgamma2.sync_host();
+  mk->k_dgamma3.sync_host();
+  mk->k_arho2b.sync_host();
+  mk->k_arho1.sync_host();
+  mk->k_arho2.sync_host();
+  mk->k_arho3.sync_host();
+  mk->k_arho3b.sync_host();
+  mk->k_t_ave.sync_host();
+  mk->k_tsq_ave.sync_host();
+
+  int m = 0;
+  for (int ii = 0; ii < n; ii++) {
+    const int i = first + ii;
+
+    // one value per atom
+    mk->h_rho0[i] = buf[m++];
+    mk->h_rho1[i] = buf[m++];
+    mk->h_rho2[i] = buf[m++];
+    mk->h_rho3[i] = buf[m++];
+    mk->h_frhop[i] = buf[m++];
+    mk->h_gamma[i] = buf[m++];
+    mk->h_dgamma1[i] = buf[m++];
+    mk->h_dgamma2[i] = buf[m++];
+    mk->h_dgamma3[i] = buf[m++];
+    mk->h_arho2b[i] = buf[m++];
+
+    // several components per atom, unpacked in component order
+    for (int k = 0; k < 3; k++) mk->h_arho1(i,k) = buf[m++];
+    for (int k = 0; k < 6; k++) mk->h_arho2(i,k) = buf[m++];
+    for (int k = 0; k < 10; k++) mk->h_arho3(i,k) = buf[m++];
+    for (int k = 0; k < 3; k++) mk->h_arho3b(i,k) = buf[m++];
+    for (int k = 0; k < 3; k++) mk->h_t_ave(i,k) = buf[m++];
+    for (int k = 0; k < 3; k++) mk->h_tsq_ave(i,k) = buf[m++];
+  }
+
+  // record that the host copies now hold the newest data
+  mk->k_rho0.modify_host();
+  mk->k_rho1.modify_host();
+  mk->k_rho2.modify_host();
+  mk->k_rho3.modify_host();
+  mk->k_frhop.modify_host();
+  mk->k_gamma.modify_host();
+  mk->k_dgamma1.modify_host();
+  mk->k_dgamma2.modify_host();
+  mk->k_dgamma3.modify_host();
+  mk->k_arho2b.modify_host();
+  mk->k_arho1.modify_host();
+  mk->k_arho2.modify_host();
+  mk->k_arho3.modify_host();
+  mk->k_arho3b.modify_host();
+  mk->k_t_ave.modify_host();
+  mk->k_tsq_ave.modify_host();
+}
+
+/* ----------------------------------------------------------------------
+   reverse comm, Kokkos path: pack 30 values for each of the n
+   consecutive atoms starting at index first_in into buf
+   runs the TagPairMEAMPackReverseComm functor over [0,n)
+   returns the number of values packed (30 per atom)
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PairMEAMKokkos<DeviceType>::pack_reverse_comm_kokkos(int n, int first_in,
+                                                         DAT::tdual_xfloat_1d &buf)
+{
+  // per-launch state read by the functor
+  v_buf = buf.view<DeviceType>();
+  first = first_in;
+
+  typedef Kokkos::RangePolicy<DeviceType, TagPairMEAMPackReverseComm> policy_type;
+  Kokkos::parallel_for(policy_type(0,n),*this);
+
+  return 30*n;
+}
+
+/* ----------------------------------------------------------------------
+   pack functor for reverse comm: the 30 accumulated values of atom
+   first+i are written to v_buf[30*i] ... v_buf[30*i+29]
+   layout: rho0 arho2b arho1[3] arho2[6] arho3[10] arho3b[3]
+           t_ave[3] tsq_ave[3]
+   NOTE(review): the views are reached through the meam_inst_kk pointer
+   inside the kernel - confirm the pointer is dereferenceable in the
+   execution space of DeviceType
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMPackReverseComm, const int &i) const {
+  auto * const mk = meam_inst_kk;
+  const int a = first + i;
+  int m = 30*i;
+
+  // one value per atom
+  v_buf[m++] = mk->d_rho0[a];
+  v_buf[m++] = mk->d_arho2b[a];
+
+  // several components per atom, packed in component order
+  for (int k = 0; k < 3; k++) v_buf[m++] = mk->d_arho1(a,k);
+  for (int k = 0; k < 6; k++) v_buf[m++] = mk->d_arho2(a,k);
+  for (int k = 0; k < 10; k++) v_buf[m++] = mk->d_arho3(a,k);
+  for (int k = 0; k < 3; k++) v_buf[m++] = mk->d_arho3b(a,k);
+  for (int k = 0; k < 3; k++) v_buf[m++] = mk->d_t_ave(a,k);
+  for (int k = 0; k < 3; k++) v_buf[m++] = mk->d_tsq_ave(a,k);
+}
+
+/* ----------------------------------------------------------------------
+   reverse comm, host path: pack 30 values for each of the atoms
+   first ... first+n-1 into buf, reading the host (h_*) views
+   every dual view that is read is synced to the host first
+   layout: rho0 arho2b arho1[3] arho2[6] arho3[10] arho3b[3]
+           t_ave[3] tsq_ave[3]
+   returns the number of values packed
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PairMEAMKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
+{
+  auto * const mk = meam_inst_kk;
+
+  // bring the host copies up to date before reading them
+  mk->k_rho0.sync_host();
+  mk->k_arho2b.sync_host();
+  mk->k_arho1.sync_host();
+  mk->k_arho2.sync_host();
+  mk->k_arho3.sync_host();
+  mk->k_arho3b.sync_host();
+  mk->k_t_ave.sync_host();
+  mk->k_tsq_ave.sync_host();
+
+  int m = 0;
+  for (int ii = 0; ii < n; ii++) {
+    const int i = first + ii;
+
+    // one value per atom
+    buf[m++] = mk->h_rho0[i];
+    buf[m++] = mk->h_arho2b[i];
+
+    // several components per atom, packed in component order
+    for (int k = 0; k < 3; k++) buf[m++] = mk->h_arho1(i,k);
+    for (int k = 0; k < 6; k++) buf[m++] = mk->h_arho2(i,k);
+    for (int k = 0; k < 10; k++) buf[m++] = mk->h_arho3(i,k);
+    for (int k = 0; k < 3; k++) buf[m++] = mk->h_arho3b(i,k);
+    for (int k = 0; k < 3; k++) buf[m++] = mk->h_t_ave(i,k);
+    for (int k = 0; k < 3; k++) buf[m++] = mk->h_tsq_ave(i,k);
+  }
+
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   reverse comm, Kokkos path: add the 30 values per atom stored in buf
+   to the n atoms listed in row iswap_in of k_sendlist
+   runs the TagPairMEAMUnpackReverseComm functor over [0,n)
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMEAMKokkos<DeviceType>::unpack_reverse_comm_kokkos(int n, DAT::tdual_int_2d k_sendlist,
+                                                            int iswap_in, DAT::tdual_xfloat_1d &buf)
+{
+  // per-launch state read by the functor
+  iswap = iswap_in;
+  d_sendlist = k_sendlist.view<DeviceType>();
+  v_buf = buf.view<DeviceType>();
+
+  typedef Kokkos::RangePolicy<DeviceType, TagPairMEAMUnpackReverseComm> policy_type;
+  Kokkos::parallel_for(policy_type(0,n),*this);
+}
+
+/* ----------------------------------------------------------------------
+   unpack functor for reverse comm: entry i of row iswap of the send
+   list selects atom j; the 30 values at v_buf[30*i] ... v_buf[30*i+29]
+   are added (+=) to that atom's values, in the order used by the
+   reverse pack functor
+   NOTE(review): the views are reached through the meam_inst_kk pointer
+   inside the kernel - confirm the pointer is dereferenceable in the
+   execution space of DeviceType
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMUnpackReverseComm, const int &i) const {
+  auto * const mk = meam_inst_kk;
+  const int j = d_sendlist(iswap, i);
+  int m = 30*i;
+
+  // one value per atom
+  mk->d_rho0[j] += v_buf[m++];
+  mk->d_arho2b[j] += v_buf[m++];
+
+  // several components per atom, accumulated in component order
+  for (int k = 0; k < 3; k++) mk->d_arho1(j,k) += v_buf[m++];
+  for (int k = 0; k < 6; k++) mk->d_arho2(j,k) += v_buf[m++];
+  for (int k = 0; k < 10; k++) mk->d_arho3(j,k) += v_buf[m++];
+  for (int k = 0; k < 3; k++) mk->d_arho3b(j,k) += v_buf[m++];
+  for (int k = 0; k < 3; k++) mk->d_t_ave(j,k) += v_buf[m++];
+  for (int k = 0; k < 3; k++) mk->d_tsq_ave(j,k) += v_buf[m++];
+}
+
+/* ----------------------------------------------------------------------
+   reverse comm, host path: add the 30 values per atom stored in buf to
+   the host (h_*) views of the n atoms in list
+   the dual views are synced to the host before the update and flagged
+   as modified on the host afterwards
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMEAMKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
+{
+  auto * const mk = meam_inst_kk;
+
+  // bring the host copies up to date before accumulating into them
+  mk->k_rho0.sync_host();
+  mk->k_arho2b.sync_host();
+  mk->k_arho1.sync_host();
+  mk->k_arho2.sync_host();
+  mk->k_arho3.sync_host();
+  mk->k_arho3b.sync_host();
+  mk->k_t_ave.sync_host();
+  mk->k_tsq_ave.sync_host();
+
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+
+    // one value per atom
+    mk->h_rho0[j] += buf[m++];
+    mk->h_arho2b[j] += buf[m++];
+
+    // several components per atom, accumulated in component order
+    for (int k = 0; k < 3; k++) mk->h_arho1(j,k) += buf[m++];
+    for (int k = 0; k < 6; k++) mk->h_arho2(j,k) += buf[m++];
+    for (int k = 0; k < 10; k++) mk->h_arho3(j,k) += buf[m++];
+    for (int k = 0; k < 3; k++) mk->h_arho3b(j,k) += buf[m++];
+    for (int k = 0; k < 3; k++) mk->h_t_ave(j,k) += buf[m++];
+    for (int k = 0; k < 3; k++) mk->h_tsq_ave(j,k) += buf[m++];
+  }
+
+  // record that the host copies now hold the newest data
+  mk->k_rho0.modify_host();
+  mk->k_arho2b.modify_host();
+  mk->k_arho1.modify_host();
+  mk->k_arho2.modify_host();
+  mk->k_arho3.modify_host();
+  mk->k_arho3b.modify_host();
+  mk->k_t_ave.modify_host();
+  mk->k_tsq_ave.modify_host();
+}
+
+/* ----------------------------------------------------------------------
+   strip special bond flags from neighbor list entries
+   they are not used with MEAM, so every entry in both the half and the
+   full neighbor list of atom ilist_half[ii] is masked with NEIGHMASK
+   in place
+   presumably launched once per reneighbor (the caller is not shown here)
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMNeighStrip, const int &ii) const {
+  const int i = d_ilist_half[ii];
+
+  // half list of atom i
+  const int nhalf = d_numneigh_half[i];
+  for (int k = 0; k < nhalf; k++) d_neighbors_half(i,k) &= NEIGHMASK;
+
+  // full list of atom i
+  const int nfull = d_numneigh_full[i];
+  for (int k = 0; k < nfull; k++) d_neighbors_full(i,k) &= NEIGHMASK;
+}
+
+/* ----------------------------------------------------------------------
+   reduction functor: adds the half-list neighbor count of the ii-th
+   atom in ilist_half to the running total n
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMEAMKokkos<DeviceType>::operator()(TagPairMEAMOffsets, const int ii, int &n) const {
+  n += d_numneigh_half[d_ilist_half[ii]];
+}
+
+/* ----------------------------------------------------------------------
+   explicit template instantiations
+   the pair styles meam/kk/host and meam/c/kk/host are registered with
+   PairMEAMKokkos<LMPHostType>, so the host variant has to be
+   instantiated for every GPU backend, not only for CUDA
+   NOTE(review): LMP_KOKKOS_GPU is expected to be the KOKKOS package
+   macro that is defined for any GPU backend (CUDA, HIP, SYCL, ...) -
+   confirm it is provided by kokkos_type.h in this revision
+------------------------------------------------------------------------- */
+
+namespace LAMMPS_NS {
+template class PairMEAMKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class PairMEAMKokkos<LMPHostType>;
+#endif
+}
+
--- a/src/KOKKOS/pair_meam_kokkos.h
+++ b/src/KOKKOS/pair_meam_kokkos.h
@ -0,0 +1,123 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+// clang-format off
+PairStyle(meam/c/kk,PairMEAMKokkos<LMPDeviceType>)
+PairStyle(meam/c/kk/device,PairMEAMKokkos<LMPDeviceType>)
+PairStyle(meam/c/kk/host,PairMEAMKokkos<LMPHostType>)
+PairStyle(meam/kk,PairMEAMKokkos<LMPDeviceType>)
+PairStyle(meam/kk/device,PairMEAMKokkos<LMPDeviceType>)
+PairStyle(meam/kk/host,PairMEAMKokkos<LMPHostType>)
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_PAIR_MEAM_KOKKOS_H
+#define LMP_PAIR_MEAM_KOKKOS_H
+
+#include "kokkos_base.h"
+#include "pair_kokkos.h"
+#include "pair_meam.h"
+#include "meam_kokkos.h"
+
+namespace LAMMPS_NS {
+
+struct TagPairMEAMNeighStrip{};          // dispatch tag: mask neighbor entries with NEIGHMASK
+struct TagPairMEAMOffsets{};             // dispatch tag: sum half-list neighbor counts
+struct TagPairMEAMPackForwardComm{};     // dispatch tag: pack 38 values per atom (forward comm)
+struct TagPairMEAMUnpackForwardComm{};   // dispatch tag: unpack 38 values per atom (forward comm)
+struct TagPairMEAMPackReverseComm{};     // dispatch tag: pack 30 values per atom (reverse comm)
+struct TagPairMEAMUnpackReverseComm{};   // dispatch tag: accumulate 30 values per atom (reverse comm)
+
+template<class DeviceType>
+class MEAMKokkos;                        // forward declaration (presumably defined via meam_kokkos.h)
+
+template<class DeviceType>
+class PairMEAMKokkos : public PairMEAM, public KokkosBase {   // Kokkos variant of PairMEAM; the object itself is passed to Kokkos::parallel_for as the functor
+ public:
+  enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF};
+  enum {COUL_FLAG=0};
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef int value_type;   // NOTE(review): presumably the reduction type matching the int& of the TagPairMEAMOffsets operator - confirm
+
+  PairMEAMKokkos(class LAMMPS *);
+  ~PairMEAMKokkos() override;
+  void compute(int, int) override;
+  void coeff(int, char **) override;
+  void init_style() override;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMEAMPackForwardComm, const int&) const;     // index = position in the send list row
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMEAMUnpackForwardComm, const int&) const;   // index = offset from 'first'
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMEAMPackReverseComm, const int&) const;     // index = offset from 'first'
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMEAMUnpackReverseComm, const int&) const;   // index = position in the send list row
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMEAMNeighStrip,  const int&) const;         // index = position in d_ilist_half
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMEAMOffsets,  const int, int&) const;       // index = position in d_ilist_half, int& = running sum
+
+  int pack_forward_comm_kokkos(int, DAT::tdual_int_2d, int, DAT::tdual_xfloat_1d&,
+                               int, int *) override;                 // returns 38 * number of atoms
+  int pack_forward_comm(int, int *, double *, int, int *) override;  // host-view counterpart
+  void unpack_forward_comm_kokkos(int, int, DAT::tdual_xfloat_1d&) override;
+  void unpack_forward_comm(int, int, double *) override;             // host-view counterpart
+  int pack_reverse_comm_kokkos(int, int, DAT::tdual_xfloat_1d&) override;   // returns 30 * number of atoms
+  int pack_reverse_comm(int, int, double *) override;                // host-view counterpart
+  void unpack_reverse_comm_kokkos(int, DAT::tdual_int_2d,
+                                  int, DAT::tdual_xfloat_1d&) override;
+  void unpack_reverse_comm(int, int *, double *) override;           // host-view counterpart
+
+ protected:
+  class MEAMKokkos<DeviceType> *meam_inst_kk;   // owner of the per-atom MEAM dual views (k_*, d_*, h_*) used by the comm routines
+  typename AT::t_x_array x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d type;
+
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
+
+  typename AT::t_int_1d d_offset;
+
+  DAT::tdual_int_1d k_map;
+  typename AT::t_int_1d d_map;
+  typename AT::t_int_2d d_scale;
+  typename AT::t_int_1d d_ilist_half;             // atom indices; read by the NeighStrip and Offsets operators
+  typename AT::t_int_1d d_numneigh_half;          // half-list neighbor count per atom
+  typename AT::t_neighbors_2d d_neighbors_half;   // half-list entries, indexed (atom, neighbor)
+  typename AT::t_int_1d d_numneigh_full;          // full-list neighbor count per atom
+  typename AT::t_neighbors_2d d_neighbors_full;   // full-list entries, indexed (atom, neighbor)
+  typename AT::t_int_2d d_sendlist;               // send lists, indexed (swap, entry); set by the *_comm_kokkos launchers
+  typename AT::t_xfloat_1d_um v_buf;              // comm buffer view; set by the *_comm_kokkos launchers
+
+  int iswap,first;   // send list row / first atom index for the comm functors
+  int neighflag,nlocal,nall,eflag,vflag;   // init_style rejects neighflag == FULL
+
+  friend void pair_virial_fdotr_compute<PairMEAMKokkos>(PairMEAMKokkos*);
+};
+
+}
+#endif
+#endif
+
--- a/src/KOKKOS/pair_snap_kokkos_impl.h
+++ b/src/KOKKOS/pair_snap_kokkos_impl.h
@ -139,7 +139,7 @@ template<class DeviceType, typename real_type, int vector_length>
 void PairSNAPKokkos<DeviceType, real_type, vector_length>::compute(int eflag_in, int vflag_in)
 {
  if (host_flag) {
-    atomKK->sync(Host,X_MASK|TYPE_MASK);
+    atomKK->sync(Host,X_MASK|F_MASK|TYPE_MASK);
    PairSNAP::compute(eflag_in,vflag_in);
    atomKK->modified(Host,F_MASK);
    return;
--- a/src/KOKKOS/pair_zbl_kokkos.cpp
+++ b/src/KOKKOS/pair_zbl_kokkos.cpp
@ -126,8 +126,6 @@ void PairZBLKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
  }

  atomKK->sync(execution_space,datamask_read);
-  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
-  else atomKK->modified(execution_space,F_MASK);

  x = atomKK->k_x.view<DeviceType>();
  f = atomKK->k_f.view<DeviceType>();
@ -177,6 +175,9 @@ void PairZBLKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
  }

  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
+  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
+  else atomKK->modified(execution_space,F_MASK);
 }

 template<class DeviceType>
--- a/src/KOKKOS/verlet_kokkos.cpp
+++ b/src/KOKKOS/verlet_kokkos.cpp
@ -282,8 +282,6 @@ void VerletKokkos::run(int n)
  f_merge_copy = DAT::t_f_array("VerletKokkos::f_merge_copy",atomKK->k_f.extent(0));

  atomKK->sync(Device,ALL_MASK);
-  //static double time = 0.0;
-  //Kokkos::Timer ktimer;

  timer->init_timeout();
  for (int i = 0; i < n; i++) {
@ -297,10 +295,8 @@ void VerletKokkos::run(int n)

    // initial time integration

-    //ktimer.reset();
    timer->stamp();
    modify->initial_integrate(vflag);
-    //time += ktimer.seconds();
    if (n_post_integrate) modify->post_integrate();
    timer->stamp(Timer::MODIFY);

@ -445,7 +441,6 @@ void VerletKokkos::run(int n)
    if (pair_compute_flag) {
      atomKK->sync(force->pair->execution_space,force->pair->datamask_read);
      atomKK->sync(force->pair->execution_space,~(~force->pair->datamask_read|(F_MASK | ENERGY_MASK | VIRIAL_MASK)));
-      Kokkos::Timer ktimer;
      force->pair->compute(eflag,vflag);
      atomKK->modified(force->pair->execution_space,force->pair->datamask_modify);
      atomKK->modified(force->pair->execution_space,~(~force->pair->datamask_modify|(F_MASK | ENERGY_MASK | VIRIAL_MASK)));
--- a/src/MDI/fix_mdi_aimd.cpp
+++ b/src/MDI/fix_mdi_aimd.cpp
@ -1,379 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/ Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
-
-#include "fix_mdi_aimd.h"
-#include "atom.h"
-#include "comm.h"
-#include "domain.h"
-#include "error.h"
-#include "force.h"
-#include "memory.h"
-#include "update.h"
-
-using namespace LAMMPS_NS;
-using namespace FixConst;
-
-enum { NATIVE, REAL, METAL };    // LAMMPS units which MDI supports
-
-/* ---------------------------------------------------------------------- */
-
-FixMDIAimd::FixMDIAimd(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
-{
-  if (narg != 3) error->all(FLERR, "Illegal fix mdi/aimd command");
-
-  scalar_flag = 1;
-  global_freq = 1;
-  extscalar = 1;
-  energy_global_flag = 1;
-  virial_global_flag = 1;
-  thermo_energy = thermo_virial = 1;
-
-  // check requirements for LAMMPS to work with MDI as an engine
-
-  if (atom->tag_enable == 0) error->all(FLERR, "Cannot use MDI engine without atom IDs");
-
-  if (atom->natoms && atom->tag_consecutive() == 0)
-    error->all(FLERR, "MDI engine requires consecutive atom IDs");
-
-  // confirm LAMMPS is being run as a driver
-
-  int role;
-  MDI_Get_role(&role);
-  if (role != MDI_DRIVER)
-    error->all(FLERR, "Must invoke LAMMPS as an MDI driver to use fix mdi/aimd");
-
-  // mdicomm will be one-time initialized in init()
-  // cannot be done here for a plugin library, b/c mdi plugin command is later
-
-  mdicomm = MDI_COMM_NULL;
-
-  // storage for all atoms
-
-  buf3 = buf3all = nullptr;
-  maxbuf = 0;
-
-  // set unit conversion factors
-
-  if (strcmp(update->unit_style, "real") == 0)
-    lmpunits = REAL;
-  else if (strcmp(update->unit_style, "metal") == 0)
-    lmpunits = METAL;
-  else
-    lmpunits = NATIVE;
-
-  unit_conversions();
-
-  nprocs = comm->nprocs;
-}
-
-/* ---------------------------------------------------------------------- */
-
-FixMDIAimd::~FixMDIAimd()
-{
-  // send exit command to engine if it is a stand-alone code
-  // for plugin, this happens in MDIPlugin::plugin_wrapper()
-
-  if (!plugin) {
-    int ierr = MDI_Send_command("EXIT", mdicomm);
-    if (ierr) error->all(FLERR, "MDI: EXIT command");
-  }
-
-  // clean up
-
-  memory->destroy(buf3);
-  memory->destroy(buf3all);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int FixMDIAimd::setmask()
-{
-  int mask = 0;
-  mask |= PRE_REVERSE;
-  mask |= POST_FORCE;
-  mask |= MIN_POST_FORCE;
-  return mask;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixMDIAimd::init()
-{
-  if (mdicomm != MDI_COMM_NULL) return;
-
-  // one-time auto-detect whether engine is stand-alone code or plugin library
-  // also initializes mdicomm
-  // plugin = 0/1 for engine = stand-alone code vs plugin library
-
-  MDI_Get_communicator(&mdicomm, 0);
-
-  if (mdicomm == MDI_COMM_NULL) {
-    plugin = 0;
-    MDI_Accept_communicator(&mdicomm);
-    if (mdicomm == MDI_COMM_NULL) error->all(FLERR, "MDI unable to connect to stand-alone engine");
-  } else {
-    plugin = 1;
-    int method;
-    MDI_Get_method(&method, mdicomm);
-    if (method != MDI_PLUGIN) error->all(FLERR, "MDI internal error for plugin engine");
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixMDIAimd::setup(int vflag)
-{
-  post_force(vflag);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixMDIAimd::setup_pre_reverse(int eflag, int vflag)
-{
-  pre_reverse(eflag, vflag);
-}
-
-/* ----------------------------------------------------------------------
-   store eflag, so can use it in post_force to request energy
------------------------------------------------------------------------- */
-
-void FixMDIAimd::pre_reverse(int eflag, int /*vflag*/)
-{
-  eflag_caller = eflag;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixMDIAimd::post_force(int vflag)
-{
-  int ilocal, ierr;
-  double cell[9];
-
-  int eflag = eflag_caller;
-  ev_init(eflag, vflag);
-
-  // if simulation box dynamically changes, send current box to MDI engine
-
-  if (domain->box_change_size || domain->box_change_shape) {
-    ierr = MDI_Send_command(">CELL_DISPL", mdicomm);
-    if (ierr) error->all(FLERR, "MDI: >CELL_DISPL command");
-    cell[0] = domain->boxlo[0] * lmp2mdi_length;
-    cell[1] = domain->boxlo[1] * lmp2mdi_length;
-    cell[2] = domain->boxlo[2] * lmp2mdi_length;
-    ierr = MDI_Send(cell, 3, MDI_DOUBLE, mdicomm);
-    if (ierr) error->all(FLERR, "MDI: >CELL_DISPL data");
-
-    ierr = MDI_Send_command(">CELL", mdicomm);
-    if (ierr) error->all(FLERR, "MDI: >CELL command");
-    cell[0] = domain->boxhi[0] - domain->boxlo[0];
-    cell[1] = 0.0;
-    cell[2] = 0.0;
-    cell[3] = domain->xy;
-    cell[4] = domain->boxhi[1] - domain->boxlo[1];
-    cell[5] = 0.0;
-    cell[6] = domain->xz;
-    cell[7] = domain->yz;
-    cell[8] = domain->boxhi[2] - domain->boxlo[2];
-    ierr = MDI_Send(cell, 9, MDI_DOUBLE, mdicomm);
-    if (ierr) error->all(FLERR, "MDI: >CELL data");
-  }
-
-  // gather all coords, ordered by atomID
-
-  reallocate();
-  memset(buf3, 0, 3 * atom->natoms * sizeof(double));
-
-  double **x = atom->x;
-  tagint *tag = atom->tag;
-  int nlocal = atom->nlocal;
-
-  for (int i = 0; i < nlocal; i++) {
-    ilocal = static_cast<int>(tag[i]) - 1;
-    buf3[3 * ilocal + 0] = x[i][0] * lmp2mdi_length;
-    buf3[3 * ilocal + 1] = x[i][1] * lmp2mdi_length;
-    buf3[3 * ilocal + 2] = x[i][2] * lmp2mdi_length;
-  }
-
-  MPI_Reduce(buf3, buf3all, 3 * atom->natoms, MPI_DOUBLE, MPI_SUM, 0, world);
-
-  // send current coords to MDI engine
-
-  ierr = MDI_Send_command(">COORDS", mdicomm);
-  if (ierr) error->all(FLERR, "MDI: >COORDS command");
-  ierr = MDI_Send(buf3all, 3 * atom->natoms, MDI_DOUBLE, mdicomm);
-  if (ierr) error->all(FLERR, "MDI: >COORDS data");
-
-  // request forces from MDI engine
-  // this triggers engine to evaluate forces,energy,stress for current system
-
-  ierr = MDI_Send_command("<FORCES", mdicomm);
-  if (ierr) error->all(FLERR, "MDI: <FORCES command");
-  ierr = MDI_Recv(buf3, 3 * atom->natoms, MDI_DOUBLE, mdicomm);
-  if (ierr) error->all(FLERR, "MDI: <FORCES data");
-  MPI_Bcast(buf3, 3 * atom->natoms, MPI_DOUBLE, 0, world);
-
-  // add forces to owned atoms
-  // use atomID to index into ordered buf3
-
-  double **f = atom->f;
-
-  for (int i = 0; i < nlocal; i++) {
-    ilocal = static_cast<int>(tag[i]) - 1;
-    f[i][0] += buf3[3 * ilocal + 0] * mdi2lmp_force;
-    f[i][1] += buf3[3 * ilocal + 1] * mdi2lmp_force;
-    f[i][2] += buf3[3 * ilocal + 2] * mdi2lmp_force;
-  }
-
-  // optionally request potential energy from MDI engine
-
-  if (eflag_global) {
-    ierr = MDI_Send_command("<PE", mdicomm);
-    if (ierr) error->all(FLERR, "MDI: <PE command");
-    ierr = MDI_Recv(&engine_energy, 1, MDI_DOUBLE, mdicomm);
-    if (ierr) error->all(FLERR, "MDI: <PE data");
-    MPI_Bcast(&engine_energy, 1, MPI_DOUBLE, 0, world);
-    engine_energy *= mdi2lmp_energy;
-  }
-
-  // optionally request pressure tensor from MDI engine, convert to virial
-  // divide by nprocs so each proc stores a portion
-
-  if (vflag_global) {
-    double ptensor[6];
-    ierr = MDI_Send_command("<STRESS", mdicomm);
-    if (ierr) error->all(FLERR, "MDI: <STRESS command");
-    ierr = MDI_Recv(ptensor, 6, MDI_DOUBLE, mdicomm);
-    if (ierr) error->all(FLERR, "MDI: <STRESS data");
-    MPI_Bcast(ptensor, 6, MPI_DOUBLE, 0, world);
-
-    double volume = domain->xprd * domain->yprd * domain->zprd;
-    for (int i = 0; i < 6; i++) {
-      ptensor[i] *= mdi2lmp_pressure;
-      virial[i] = ptensor[i] * volume / force->nktv2p / nprocs;
-    }
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixMDIAimd::min_post_force(int vflag)
-{
-  post_force(vflag);
-}
-
-/* ----------------------------------------------------------------------
-   energy from MDI engine
------------------------------------------------------------------------- */
-
-double FixMDIAimd::compute_scalar()
-{
-  return engine_energy;
-}
-
-/* ----------------------------------------------------------------------
-   reallocate storage for all atoms if necessary
------------------------------------------------------------------------- */
-
-void FixMDIAimd::reallocate()
-{
-  if (atom->natoms <= maxbuf) return;
-
-  if (3 * atom->natoms > MAXSMALLINT)
-    error->all(FLERR, "Natoms too large to use with fix mdi/aimd");
-
-  maxbuf = atom->natoms;
-
-  memory->destroy(buf3);
-  memory->destroy(buf3all);
-
-  memory->create(buf3, 3 * maxbuf, "mdi:buf3");
-  memory->create(buf3all, 3 * maxbuf, "mdi:buf3all");
-}
-
-/* ----------------------------------------------------------------------
-   MDI to/from LAMMPS conversion factors
------------------------------------------------------------------------- */
-
-void FixMDIAimd::unit_conversions()
-{
-  double angstrom_to_bohr, kelvin_to_hartree, ev_to_hartree, second_to_aut;
-
-  MDI_Conversion_factor("angstrom", "bohr", &angstrom_to_bohr);
-  MDI_Conversion_factor("kelvin_energy", "hartree", &kelvin_to_hartree);
-  MDI_Conversion_factor("electron_volt", "hartree", &ev_to_hartree);
-  MDI_Conversion_Factor("second", "atomic_unit_of_time", &second_to_aut);
-
-  // length units
-
-  mdi2lmp_length = 1.0;
-  lmp2mdi_length = 1.0;
-
-  if (lmpunits == REAL || lmpunits == METAL) {
-    lmp2mdi_length = angstrom_to_bohr;
-    mdi2lmp_length = 1.0 / angstrom_to_bohr;
-  }
-
-  // energy units
-
-  mdi2lmp_energy = 1.0;
-  lmp2mdi_energy = 1.0;
-
-  if (lmpunits == REAL) {
-    lmp2mdi_energy = kelvin_to_hartree / force->boltz;
-    mdi2lmp_energy = force->boltz / kelvin_to_hartree;
-  } else if (lmpunits == METAL) {
-    lmp2mdi_energy = ev_to_hartree;
-    mdi2lmp_energy = 1.0 / ev_to_hartree;
-  }
-
-  // force units = energy/length
-
-  mdi2lmp_force = 1.0;
-  lmp2mdi_force = 1.0;
-
-  if (lmpunits == REAL) {
-    lmp2mdi_force = (kelvin_to_hartree / force->boltz) / angstrom_to_bohr;
-    mdi2lmp_force = 1.0 / lmp2mdi_force;
-  } else if (lmpunits == METAL) {
-    lmp2mdi_force = ev_to_hartree / angstrom_to_bohr;
-    mdi2lmp_force = angstrom_to_bohr / ev_to_hartree;
-  }
-
-  // pressure or stress units = force/area = energy/volume
-
-  mdi2lmp_pressure = 1.0;
-  lmp2mdi_pressure = 1.0;
-
-  if (lmpunits == REAL) {
-    lmp2mdi_pressure = (kelvin_to_hartree / force->boltz) /
-        (angstrom_to_bohr * angstrom_to_bohr * angstrom_to_bohr) / force->nktv2p;
-    mdi2lmp_pressure = 1.0 / lmp2mdi_pressure;
-  } else if (lmpunits == METAL) {
-    lmp2mdi_pressure =
-        ev_to_hartree / (angstrom_to_bohr * angstrom_to_bohr * angstrom_to_bohr) / force->nktv2p;
-    mdi2lmp_pressure = 1.0 / lmp2mdi_pressure;
-  }
-
-  // velocity units = distance/time
-
-  mdi2lmp_velocity = 1.0;
-  lmp2mdi_velocity = 1.0;
-
-  if (lmpunits == REAL) {
-    lmp2mdi_velocity = angstrom_to_bohr / (1.0e-15 * second_to_aut);
-    mdi2lmp_velocity = 1.0 / lmp2mdi_velocity;
-  } else if (lmpunits == METAL) {
-    lmp2mdi_velocity = angstrom_to_bohr / (1.0e-12 * second_to_aut);
-    mdi2lmp_velocity = 1.0 / lmp2mdi_velocity;
-  }
-}
--- a/src/MDI/fix_mdi_qm.cpp
+++ b/src/MDI/fix_mdi_qm.cpp
@ -0,0 +1,591 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "fix_mdi_qm.h"
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "update.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+// LAMMPS unit styles as seen by this fix
+// REAL and METAL get explicit factors in unit_conversions()
+// every other unit style is mapped to NATIVE, whose factors all stay at 1.0
+
+enum { NATIVE, REAL, METAL };    // LAMMPS units which MDI supports
+
+// largest atomic number accepted by the "elements" keyword
+
+#define MAXELEMENT 103           // used elsewhere in MDI package
+
+/* ---------------------------------------------------------------------- */
+
+// Create a fix mdi/qm instance.
+// Verifies that atom IDs exist and are consecutive and that LAMMPS runs as
+// an MDI driver, parses the optional keywords (virial, add, every, connect,
+// elements), sets the fix output flags, and zeroes all stored QM results.
+// Any illegal keyword or value aborts via error->all().
+
+FixMDIQM::FixMDIQM(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
+{
+  // atom IDs must exist and be consecutive
+  // the global buffers exchanged with the engine are indexed by atomID-1
+
+  if (atom->tag_enable == 0)
+    error->all(FLERR, "Cannot use MDI engine without atom IDs");
+  if (atom->natoms && atom->tag_consecutive() == 0)
+    error->all(FLERR, "MDI engine requires consecutive atom IDs");
+
+  // confirm LAMMPS is being run as a driver
+
+  int role;
+  MDI_Get_role(&role);
+  if (role != MDI_DRIVER)
+    error->all(FLERR, "Must invoke LAMMPS as an MDI driver to use fix mdi/qm");
+
+  // optional args and their defaults
+  //   virial yes/no = request stress tensor from the engine (default no)
+  //   add yes/no = add QM forces to atoms and tally energy/virial (default yes)
+  //   every N = only talk to the engine on timesteps that are multiples of N
+  //   connect yes/no = this fix makes the engine connection itself (default yes)
+  //   elements N1 N2 ... = one atomic number per atom type
+  //                        if not specified, atom types are sent instead
+
+  virialflag = 0;
+  addflag = 1;
+  every = 1;
+  connectflag = 1;
+  elements = nullptr;
+
+  int iarg = 3;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"virial") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal fix mdi/qm command");
+      if (strcmp(arg[iarg+1],"yes") == 0) virialflag = 1;
+      else if (strcmp(arg[iarg+1],"no") == 0) virialflag = 0;
+      else error->all(FLERR,"Illegal fix mdi/qm command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"add") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal fix mdi/qm command");
+      if (strcmp(arg[iarg+1],"yes") == 0) addflag = 1;
+      else if (strcmp(arg[iarg+1],"no") == 0) addflag = 0;
+      else error->all(FLERR,"Illegal fix mdi/qm command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"every") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal fix mdi/qm command");
+      every = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
+      if (every <= 0) error->all(FLERR,"Illegal fix mdi/qm command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"connect") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal fix mdi/qm command");
+      if (strcmp(arg[iarg+1],"yes") == 0) connectflag = 1;
+      else if (strcmp(arg[iarg+1],"no") == 0) connectflag = 0;
+      else error->all(FLERR,"Illegal fix mdi/qm command");
+      iarg += 2;
+    } else if (strcmp(arg[iarg],"elements") == 0) {
+
+      // one value per atom type must follow the keyword
+      // elements[] is indexed by atom type, so entry 0 is unused
+      // delete first in case the keyword is specified more than once
+
+      int ntypes = atom->ntypes;
+      if (iarg+ntypes+1 > narg) error->all(FLERR,"Illegal fix mdi/qm command");
+      delete [] elements;
+      elements = new int[ntypes+1];
+      for (int i = 1; i <= ntypes; i++) {
+        elements[i] = utils::inumeric(FLERR,arg[iarg+i],false,lmp);
+        if (elements[i] < 1 || elements[i] > MAXELEMENT)
+          error->all(FLERR,"Illegal fix mdi/qm command");
+      }
+      iarg += ntypes+1;
+    } else error->all(FLERR,"Illegal fix mdi/qm command");
+  }
+
+  // fix output settings are based on optional keywords
+
+  scalar_flag = 1;
+  global_freq = every;
+  extscalar = 1;
+
+  peratom_flag = 1;
+  size_peratom_cols = 3;
+  peratom_freq = every;
+  extvector = 0;
+
+  if (virialflag) {
+    vector_flag = 1;
+    size_vector = 6;
+  }
+
+  if (addflag) {
+    energy_global_flag = 1;
+    virial_global_flag = 1;
+    thermo_energy = thermo_virial = 1;
+  }
+
+  // mdicomm will be initialized in init()
+  // cannot do here for a plugin library, b/c mdi plugin command comes later
+  // plugin is also set in init(), give it a defined value until then
+
+  mdicomm = MDI_COMM_NULL;
+  plugin = 0;
+
+  // peratom storage, both for nlocal and global natoms
+
+  fqm = nullptr;
+  maxlocal = 0;
+
+  ibuf1 = ibuf1all = nullptr;
+  buf3 = buf3all = nullptr;
+  maxbuf = 0;
+
+  // set unit conversion factors
+
+  if (strcmp(update->unit_style, "real") == 0)
+    lmpunits = REAL;
+  else if (strcmp(update->unit_style, "metal") == 0)
+    lmpunits = METAL;
+  else
+    lmpunits = NATIVE;
+
+  unit_conversions();
+
+  nprocs = comm->nprocs;
+
+  // initialize outputs
+  // zero every entry of both QM virial arrays
+  // compute_vector() returns qm_virial_symmetric and post_force() skips the
+  //   QM evaluation on timesteps that are not a multiple of every,
+  //   so the array could otherwise be read before it was ever written
+
+  qm_energy = 0.0;
+  for (int i = 0; i < 9; i++) qm_virial[i] = 0.0;
+  for (int i = 0; i < 6; i++) qm_virial_symmetric[i] = 0.0;
+  sumflag = 0;
+
+  if (virialflag) {
+    for (int i = 0; i < 6; i++) virial[i] = 0.0;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixMDIQM::~FixMDIQM()
+{
+  // this fix only sends EXIT when it made the connection itself
+  //   and the engine is a stand-alone code
+  // for connectflag = 0, this is done via "mdi exit" command
+  // for plugin, this is done in MDIPlugin::plugin_wrapper()
+
+  const bool connected = (mdicomm != MDI_COMM_NULL);
+
+  if (connected && connectflag && !plugin) {
+    if (MDI_Send_command("EXIT", mdicomm))
+      error->all(FLERR, "MDI: EXIT command");
+  }
+
+  // free MDI comm buffers, peratom forces, and per-type element list
+
+  memory->destroy(buf3all);
+  memory->destroy(buf3);
+  memory->destroy(ibuf1all);
+  memory->destroy(ibuf1);
+
+  memory->destroy(fqm);
+
+  delete[] elements;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixMDIQM::setmask()
+{
+  // bitmask of the FixConst stages this fix registers for
+
+  return PRE_REVERSE | POST_FORCE | MIN_POST_FORCE;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Establish the MDI communicator on the first call, then describe the
+// current system to the engine: >NATOMS, then >ELEMENTS or >TYPES,
+// then the simulation box via send_box().
+
+void FixMDIQM::init()
+{
+  // set local mdicomm one-time only
+  // also set plugin = 0/1 for engine = stand-alone code vs plugin library
+
+  if (mdicomm == MDI_COMM_NULL) {
+
+    // this fix makes one-time connection to engine
+
+    if (connectflag) {
+
+      // if MDI's mdicomm not set, need to Accept_comm() with stand-alone engine
+      // otherwise are already connected to plugin engine
+
+      MDI_Get_communicator(&mdicomm, 0);
+
+      if (mdicomm == MDI_COMM_NULL) {
+        plugin = 0;
+        MDI_Accept_communicator(&mdicomm);
+        if (mdicomm == MDI_COMM_NULL)
+          error->all(FLERR, "MDI unable to connect to stand-alone engine");
+
+      } else {
+        plugin = 1;
+        int method;
+        MDI_Get_method(&method, mdicomm);
+        if (method != MDI_PLUGIN)
+          error->all(FLERR, "MDI internal error for plugin engine");
+      }
+
+    // connection should have been already made by "mdi connect" command
+    // only works for stand-alone engines
+
+    } else {
+      plugin = 0;
+
+      if (lmp->mdicomm == nullptr)
+        error->all(FLERR,"Fix mdi/qm is not connected to engine via mdi connect");
+
+      // lmp->mdicomm is an untyped pointer
+      // copy sizeof(MDI_Comm) bytes from it into the local communicator
+
+      int nbytes = sizeof(MDI_Comm);
+      char *ptrcomm = (char *) lmp->mdicomm;
+      memcpy(&mdicomm,ptrcomm,nbytes);
+    }
+  }
+
+  // send natoms, atom types or elements, and simulation box to engine
+  // this will trigger setup of a new system
+  // subsequent calls in post_force() will be for same system until new init()
+
+  // buffers must be sized for natoms before send_elements() or send_types()
+
+  reallocate();
+
+  int ierr = MDI_Send_command(">NATOMS", mdicomm);
+  if (ierr) error->all(FLERR, "MDI: >NATOMS command");
+  int n = static_cast<int> (atom->natoms);
+  ierr = MDI_Send(&n, 1, MDI_INT, mdicomm);
+  if (ierr) error->all(FLERR, "MDI: >NATOMS data");
+
+  // atomic numbers are sent if the elements keyword was used, else atom types
+
+  if (elements) send_elements();
+  else send_types();
+  send_box();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// setup simply forwards to post_force() with the same vflag
+
+void FixMDIQM::setup(int vflag)
+{
+  post_force(vflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Perform one QM evaluation through the MDI engine.
+// Sends the current coords (and the box if it changes dynamically),
+// receives the QM energy and forces, and optionally the stress tensor.
+// Results are stored as fix outputs (qm_energy, fqm, qm_virial_symmetric).
+// If add = yes, forces are also added to the owned atoms
+// and the fix virial is set.
+// Does nothing on timesteps that are not a multiple of every.
+// vflag is not used.
+
+void FixMDIQM::post_force(int vflag)
+{
+  int index, ierr;
+
+  // skip if timestep is not a multiple of every
+
+  if (update->ntimestep % every) return;
+
+  // reallocate peratom storage if necessary, both natoms and nlocal
+
+  reallocate();
+
+  // if simulation box dynamically changes, send current box to MDI engine
+
+  if (domain->box_change_size || domain->box_change_shape)
+    send_box();
+
+  // natoms as an int, reallocate() has checked that 3*natoms fits in an int
+  // used for all buffer sizes and MPI/MDI counts, which take int arguments
+
+  const int n = static_cast<int> (atom->natoms);
+
+  // gather all coords, ordered by atomID
+  // each proc fills in its owned atoms, all other entries stay zero
+
+  memset(buf3, 0, 3 * n * sizeof(double));
+
+  double **x = atom->x;
+  tagint *tag = atom->tag;
+  int nlocal = atom->nlocal;
+
+  for (int i = 0; i < nlocal; i++) {
+    index = static_cast<int>(tag[i]) - 1;
+    buf3[3 * index + 0] = x[i][0] * lmp2mdi_length;
+    buf3[3 * index + 1] = x[i][1] * lmp2mdi_length;
+    buf3[3 * index + 2] = x[i][2] * lmp2mdi_length;
+  }
+
+  MPI_Reduce(buf3, buf3all, 3 * n, MPI_DOUBLE, MPI_SUM, 0, world);
+
+  // send current coords to MDI engine
+
+  ierr = MDI_Send_command(">COORDS", mdicomm);
+  if (ierr) error->all(FLERR, "MDI: >COORDS command");
+  ierr = MDI_Send(buf3all, 3 * n, MDI_DOUBLE, mdicomm);
+  if (ierr) error->all(FLERR, "MDI: >COORDS data");
+
+  // request potential energy from MDI engine
+  // this triggers engine to perform QM calculation
+  // qm_energy = fix output for global QM energy
+
+  ierr = MDI_Send_command("<PE", mdicomm);
+  if (ierr) error->all(FLERR, "MDI: <PE command");
+  ierr = MDI_Recv(&qm_energy, 1, MDI_DOUBLE, mdicomm);
+  if (ierr) error->all(FLERR, "MDI: <PE data");
+  MPI_Bcast(&qm_energy, 1, MPI_DOUBLE, 0, world);
+  qm_energy *= mdi2lmp_energy;
+
+  // request forces from MDI engine
+
+  ierr = MDI_Send_command("<FORCES", mdicomm);
+  if (ierr) error->all(FLERR, "MDI: <FORCES command");
+  ierr = MDI_Recv(buf3, 3 * n, MDI_DOUBLE, mdicomm);
+  if (ierr) error->all(FLERR, "MDI: <FORCES data");
+  MPI_Bcast(buf3, 3 * n, MPI_DOUBLE, 0, world);
+
+  // fqm = fix output for peratom QM forces
+  // use atomID of local atoms to index into ordered buf3
+  // optionally add the same converted forces to owned atoms
+
+  double **f = atom->f;
+
+  for (int i = 0; i < nlocal; i++) {
+    index = static_cast<int>(tag[i]) - 1;
+    fqm[i][0] = buf3[3 * index + 0] * mdi2lmp_force;
+    fqm[i][1] = buf3[3 * index + 1] * mdi2lmp_force;
+    fqm[i][2] = buf3[3 * index + 2] * mdi2lmp_force;
+
+    if (addflag) {
+      f[i][0] += fqm[i][0];
+      f[i][1] += fqm[i][1];
+      f[i][2] += fqm[i][2];
+    }
+  }
+
+  // optionally request stress tensor from MDI engine, convert to virial
+  // qm_virial = 9 values as received from the engine
+  // qm_virial_symmetric = fix output for global QM virial
+  //   3 diagonal entries, then averages of each pair of off-diagonal entries
+
+  if (virialflag) {
+    ierr = MDI_Send_command("<STRESS", mdicomm);
+    if (ierr) error->all(FLERR, "MDI: <STRESS command");
+    ierr = MDI_Recv(qm_virial, 9, MDI_DOUBLE, mdicomm);
+    if (ierr) error->all(FLERR, "MDI: <STRESS data");
+    MPI_Bcast(qm_virial, 9, MPI_DOUBLE, 0, world);
+
+    qm_virial_symmetric[0] = qm_virial[0] * mdi2lmp_pressure;
+    qm_virial_symmetric[1] = qm_virial[4] * mdi2lmp_pressure;
+    qm_virial_symmetric[2] = qm_virial[8] * mdi2lmp_pressure;
+    qm_virial_symmetric[3] = 0.5*(qm_virial[1]+qm_virial[3]) * mdi2lmp_pressure;
+    qm_virial_symmetric[4] = 0.5*(qm_virial[2]+qm_virial[6]) * mdi2lmp_pressure;
+    qm_virial_symmetric[5] = 0.5*(qm_virial[5]+qm_virial[7]) * mdi2lmp_pressure;
+  }
+
+  // optionally set fix->virial
+  //   multiply by volume to make it extensive
+  //   divide by nprocs so each proc stores a portion
+  // this is b/c ComputePressure expects that as input from a fix
+  //   it will do an MPI_Allreduce and divide by volume
+
+  if (virialflag && addflag) {
+
+    // area for a 2d system, otherwise the full 3d volume
+
+    double volume = domain->xprd * domain->yprd;
+    if (domain->dimension != 2) volume *= domain->zprd;
+
+    for (int i = 0; i < 6; i++)
+      virial[i] = qm_virial_symmetric[i]*volume/nprocs;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// min_post_force simply forwards to post_force() with the same vflag
+
+void FixMDIQM::min_post_force(int vflag)
+{
+  post_force(vflag);
+}
+
+/* ----------------------------------------------------------------------
+   energy from MDI engine
+------------------------------------------------------------------------- */
+
+double FixMDIQM::compute_scalar()
+{
+  // value stored by the most recent post_force() that contacted the engine
+  // already converted with mdi2lmp_energy
+
+  return qm_energy;
+}
+
+/* ----------------------------------------------------------------------
+   virial from MDI engine
+------------------------------------------------------------------------- */
+
+double FixMDIQM::compute_vector(int n)
+{
+  // n indexes the 6 symmetrized components filled in by post_force()
+  // no bounds check is done here, size_vector is set to 6 in the constructor
+
+  return qm_virial_symmetric[n];
+}
+
+/* ----------------------------------------------------------------------
+   reallocate storage for local atoms and for all global atoms if needed
+------------------------------------------------------------------------- */
+
+void FixMDIQM::reallocate()
+{
+  // peratom QM forces, regrown when nlocal exceeds the current allocation
+  // sized to atom->nmax and exposed as the peratom array output of the fix
+
+  if (atom->nlocal > maxlocal) {
+    maxlocal = atom->nmax;
+    memory->destroy(fqm);
+    memory->create(fqm, maxlocal, 3, "mdi:fqm");
+    array_atom = fqm;
+  }
+
+  // global buffers for MDI comm, regrown when natoms exceeds maxbuf
+  // 3*natoms must fit in a 32-bit int since it is used as an MPI/MDI count
+
+  if (atom->natoms > maxbuf) {
+    bigint nsize = atom->natoms * 3;
+    if (nsize > MAXSMALLINT)
+      error->all(FLERR, "Natoms too large to use with fix mdi/qm");
+
+    maxbuf = static_cast<int> (atom->natoms);
+
+    // free all four old buffers before re-creating them
+    // ibuf1all must be included here, else its old allocation is leaked
+
+    memory->destroy(ibuf1);
+    memory->destroy(ibuf1all);
+    memory->destroy(buf3);
+    memory->destroy(buf3all);
+    memory->create(ibuf1, maxbuf, "mdi:ibuf1");
+    memory->create(ibuf1all, maxbuf, "mdi:ibuf1all");
+    memory->create(buf3, 3 * maxbuf, "mdi:buf3");
+    memory->create(buf3all, 3 * maxbuf, "mdi:buf3all");
+  }
+}
+
+/* ----------------------------------------------------------------------
+   send LAMMPS atom types to MDI engine
+------------------------------------------------------------------------- */
+
+void FixMDIQM::send_types()
+{
+  const int natoms = static_cast<int> (atom->natoms);
+  const int nlocal = atom->nlocal;
+  const tagint *tag = atom->tag;
+  const int *type = atom->type;
+
+  // each proc stores the type of every owned atom at slot atomID-1
+  // all other slots stay zero
+  // the MPI_SUM reduction onto proc 0 combines them into one ordered list
+
+  memset(ibuf1, 0, natoms * sizeof(int));
+
+  for (int i = 0; i < nlocal; i++) {
+    const int slot = static_cast<int>(tag[i]) - 1;
+    ibuf1[slot] = type[i];
+  }
+
+  MPI_Reduce(ibuf1, ibuf1all, natoms, MPI_INT, MPI_SUM, 0, world);
+
+  // send the ordered list to the MDI engine
+
+  if (MDI_Send_command(">TYPES", mdicomm))
+    error->all(FLERR, "MDI: >TYPES command");
+  if (MDI_Send(ibuf1all, natoms, MDI_INT, mdicomm))
+    error->all(FLERR, "MDI: >TYPES data");
+}
+
+/* ----------------------------------------------------------------------
+   send elements to MDI engine = atomic numbers for each type
+------------------------------------------------------------------------- */
+
+void FixMDIQM::send_elements()
+{
+  int n = static_cast<int> (atom->natoms);
+  memset(ibuf1, 0, n * sizeof(int));
+
+  // use local atomID to index into ordered ibuf1
+  // elements[] maps the atom type of each owned atom to its atomic number
+  // slots of atoms owned by other procs stay zero
+
+  tagint *tag = atom->tag;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+
+  int index;
+  for (int i = 0; i < nlocal; i++) {
+    index = static_cast<int>(tag[i]) - 1;
+    ibuf1[index] = elements[type[i]];
+  }
+
+  // sum over procs onto proc 0 to combine the per-proc lists
+
+  MPI_Reduce(ibuf1, ibuf1all, n, MPI_INT, MPI_SUM, 0, world);
+
+  int ierr = MDI_Send_command(">ELEMENTS", mdicomm);
+  if (ierr) error->all(FLERR, "MDI: >ELEMENTS command");
+  ierr = MDI_Send(ibuf1all, n, MDI_INT, mdicomm);
+  if (ierr) error->all(FLERR, "MDI: >ELEMENTS data");
+}
+
+/* ----------------------------------------------------------------------
+   send simulation box size and shape to MDI engine
+------------------------------------------------------------------------- */
+
+void FixMDIQM::send_box()
+{
+  double cell[9];
+
+  // >CELL_DISPL = lower corner of the box, in MDI length units
+
+  int ierr = MDI_Send_command(">CELL_DISPL", mdicomm);
+  if (ierr) error->all(FLERR, "MDI: >CELL_DISPL command");
+  cell[0] = domain->boxlo[0] * lmp2mdi_length;
+  cell[1] = domain->boxlo[1] * lmp2mdi_length;
+  cell[2] = domain->boxlo[2] * lmp2mdi_length;
+  ierr = MDI_Send(cell, 3, MDI_DOUBLE, mdicomm);
+  if (ierr) error->all(FLERR, "MDI: >CELL_DISPL data");
+
+  // >CELL = 3 cell vectors, stored one after the other
+  // built from the box extents and the tilt factors xy, xz, yz
+
+  ierr = MDI_Send_command(">CELL", mdicomm);
+  if (ierr) error->all(FLERR, "MDI: >CELL command");
+  cell[0] = domain->boxhi[0] - domain->boxlo[0];
+  cell[1] = 0.0;
+  cell[2] = 0.0;
+  cell[3] = domain->xy;
+  cell[4] = domain->boxhi[1] - domain->boxlo[1];
+  cell[5] = 0.0;
+  cell[6] = domain->xz;
+  cell[7] = domain->yz;
+  cell[8] = domain->boxhi[2] - domain->boxlo[2];
+
+  // cell vectors are lengths, like the box displacement and the coords
+  // so they need the same conversion from LAMMPS to MDI length units
+
+  for (int i = 0; i < 9; i++) cell[i] *= lmp2mdi_length;
+
+  ierr = MDI_Send(cell, 9, MDI_DOUBLE, mdicomm);
+  if (ierr) error->all(FLERR, "MDI: >CELL data");
+}
+
+/* ----------------------------------------------------------------------
+   MDI to/from LAMMPS conversion factors
+------------------------------------------------------------------------- */
+
+void FixMDIQM::unit_conversions()
+{
+  // sets the lmp2mdi and mdi2lmp factors for length, energy, force, pressure
+  // for lmpunits = NATIVE every factor stays at 1.0
+  // only the base factors needed by those four quantities are queried
+
+  double angstrom_to_bohr, kelvin_to_hartree, ev_to_hartree;
+
+  MDI_Conversion_factor("angstrom", "bohr", &angstrom_to_bohr);
+  MDI_Conversion_factor("kelvin_energy", "hartree", &kelvin_to_hartree);
+  MDI_Conversion_factor("electron_volt", "hartree", &ev_to_hartree);
+
+  // length units
+
+  mdi2lmp_length = 1.0;
+  lmp2mdi_length = 1.0;
+
+  if (lmpunits == REAL || lmpunits == METAL) {
+    lmp2mdi_length = angstrom_to_bohr;
+    mdi2lmp_length = 1.0 / angstrom_to_bohr;
+  }
+
+  // energy units
+  // for REAL, dividing by force->boltz turns the energy into Kelvin first
+
+  mdi2lmp_energy = 1.0;
+  lmp2mdi_energy = 1.0;
+
+  if (lmpunits == REAL) {
+    lmp2mdi_energy = kelvin_to_hartree / force->boltz;
+    mdi2lmp_energy = force->boltz / kelvin_to_hartree;
+  } else if (lmpunits == METAL) {
+    lmp2mdi_energy = ev_to_hartree;
+    mdi2lmp_energy = 1.0 / ev_to_hartree;
+  }
+
+  // force units = energy/length
+
+  mdi2lmp_force = 1.0;
+  lmp2mdi_force = 1.0;
+
+  if (lmpunits == REAL) {
+    lmp2mdi_force = (kelvin_to_hartree / force->boltz) / angstrom_to_bohr;
+    mdi2lmp_force = 1.0 / lmp2mdi_force;
+  } else if (lmpunits == METAL) {
+    lmp2mdi_force = ev_to_hartree / angstrom_to_bohr;
+    mdi2lmp_force = angstrom_to_bohr / ev_to_hartree;
+  }
+
+  // pressure or stress units = force/area = energy/volume
+  // the LAMMPS-side value is additionally divided by force->nktv2p
+
+  mdi2lmp_pressure = 1.0;
+  lmp2mdi_pressure = 1.0;
+
+  if (lmpunits == REAL) {
+    lmp2mdi_pressure = (kelvin_to_hartree / force->boltz) /
+        (angstrom_to_bohr * angstrom_to_bohr * angstrom_to_bohr) / force->nktv2p;
+    mdi2lmp_pressure = 1.0 / lmp2mdi_pressure;
+  } else if (lmpunits == METAL) {
+    lmp2mdi_pressure =
+        ev_to_hartree / (angstrom_to_bohr * angstrom_to_bohr * angstrom_to_bohr) / force->nktv2p;
+    mdi2lmp_pressure = 1.0 / lmp2mdi_pressure;
+  }
+}
--- a/src/MDI/fix_mdi_aimd.h
+++ b/src/MDI/fix_mdi_aimd.h
@ -13,42 +13,46 @@

 #ifdef FIX_CLASS
 // clang-format off
-FixStyle(mdi/aimd,FixMDIAimd);
+FixStyle(mdi/qm,FixMDIQM);
 // clang-format on
 #else

-#ifndef LMP_FIX_MDI_AIMD_H
-#define LMP_FIX_MDI_AIMD_H
+#ifndef LMP_FIX_MDI_QM_H
+#define LMP_FIX_MDI_QM_H

 #include "fix.h"
 #include <mdi.h>

 namespace LAMMPS_NS {

-class FixMDIAimd : public Fix {
+class FixMDIQM : public Fix {
 public:
-  FixMDIAimd(class LAMMPS *, int, char **);
-  ~FixMDIAimd();
+  FixMDIQM(class LAMMPS *, int, char **);
+  ~FixMDIQM();
  int setmask();

  void init();
  void setup(int);
-  void setup_pre_reverse(int, int);
-  void pre_reverse(int, int);
  void post_force(int);
  void min_post_force(int);
  double compute_scalar();
+  double compute_vector(int);

 private:
  int nprocs;
+  int every,virialflag,addflag,connectflag;
  int plugin;
+  int maxlocal;
+  int sumflag;
+  int *elements;
+
+  double qm_energy;
+  int lmpunits;
+  double qm_virial[9],qm_virial_symmetric[6];
+  double **fqm;

  MDI_Comm mdicomm;

-  int eflag_caller;
-  double engine_energy;
-  int lmpunits;
-
  // unit conversion factors

  double lmp2mdi_length, mdi2lmp_length;
@ -60,11 +64,15 @@ class FixMDIAimd : public Fix {
  // buffers for MDI comm

  int maxbuf;
+  int *ibuf1, *ibuf1all;
  double *buf3, *buf3all;

  // methods

  void reallocate();
+  void send_types();
+  void send_elements();
+  void send_box();
  void unit_conversions();
 };

--- a/src/MDI/mdi_command.cpp
+++ b/src/MDI/mdi_command.cpp
@ -16,13 +16,19 @@
 #include "error.h"
 #include "mdi_engine.h"
 #include "mdi_plugin.h"
+#include "memory.h"

 #include <cstring>

 using namespace LAMMPS_NS;

 /* ----------------------------------------------------------------------
-   mdi command: engine or plugin
+   mdi command: engine or plugin or connect or exit
+   engine is used when LAMMPS is an MDI engine, to start listening for requests
+   plugin is used when LAMMPS is an MDI driver to load a plugin library
+   connect and exit are used when LAMMPS is an MDI driver to
+     (a) connect = setup comm with a stand-alone MDI engine
+     (b) exit = terminate comm with a stand-alone MDI engine
 ---------------------------------------------------------------------- */

 void MDICommand::command(int narg, char **arg)
@ -31,8 +37,45 @@ void MDICommand::command(int narg, char **arg)

  if (strcmp(arg[0], "engine") == 0) {
    MDIEngine(lmp, narg - 1, &arg[1]);
+
  } else if (strcmp(arg[0], "plugin") == 0) {
    MDIPlugin(lmp, narg - 1, &arg[1]);
-  } else
-    error->all(FLERR, "Illegal mdi command");
+
+  } else if (strcmp(arg[0], "connect") == 0) {
+
+    if (lmp->mdicomm != nullptr)
+      error->all(FLERR,"MDI cannot connect to already connected engine");
+
+    MDI_Comm mdicomm;
+    MDI_Get_communicator(&mdicomm, 0);
+
+    if (mdicomm == MDI_COMM_NULL) {
+      MDI_Accept_communicator(&mdicomm);
+      if (mdicomm == MDI_COMM_NULL)
+        error->all(FLERR, "MDI unable to connect to stand-alone engine");
+    } else error->all(FLERR, "Cannot use mdi connect with plugin engine");
+
+    int nbytes = sizeof(MDI_Comm);
+    char *ptrcomm = (char *) memory->smalloc(nbytes,"mdi:mdicomm");
+    memcpy(ptrcomm,&mdicomm,nbytes);
+
+    lmp->mdicomm = (void *) ptrcomm;
+
+  } else if (strcmp(arg[0], "exit") == 0) {
+
+    if (lmp->mdicomm == nullptr)
+      error->all(FLERR,"MDI cannot send exit to unconnected engine");
+
+    MDI_Comm mdicomm;
+    int nbytes = sizeof(MDI_Comm);
+    char *ptrcomm = (char *) lmp->mdicomm;
+    memcpy(&mdicomm,ptrcomm,nbytes);
+
+    int ierr = MDI_Send_command("EXIT", mdicomm);
+    if (ierr) error->all(FLERR, "MDI: EXIT command");
+
+    memory->sfree(ptrcomm);
+    lmp->mdicomm = nullptr;
+
+  } else error->all(FLERR, "Illegal mdi command");
 }
--- a/src/MDI/mdi_engine.cpp
+++ b/src/MDI/mdi_engine.cpp
@ -54,6 +54,8 @@ enum { DEFAULT, MD, OPT };       // top-level MDI engine modes

 enum { TYPE, CHARGE, MASS, COORD, VELOCITY, FORCE, ADDFORCE };

+#define MAXELEMENT 103           // used elsewhere in MDI package
+
 /* ----------------------------------------------------------------------
   trigger LAMMPS to start acting as an MDI engine
   either in standalone mode or plugin mode
@ -63,17 +65,47 @@ enum { TYPE, CHARGE, MASS, COORD, VELOCITY, FORCE, ADDFORCE };
   when EXIT command is received, mdi engine command exits
 ---------------------------------------------------------------------- */

-MDIEngine::MDIEngine(LAMMPS *_lmp, int narg, char ** /*arg*/) : Pointers(_lmp)
+MDIEngine::MDIEngine(LAMMPS *_lmp, int narg, char ** arg) : Pointers(_lmp)
 {
-  if (narg) error->all(FLERR, "Illegal mdi engine command");
-
  // check requirements for LAMMPS to work with MDI as an engine

-  if (atom->tag_enable == 0) error->all(FLERR, "Cannot use MDI engine without atom IDs");
+  if (atom->tag_enable == 0) error->all(FLERR, "MDI engine requires atom IDs");

  if (atom->natoms && atom->tag_consecutive() == 0)
    error->all(FLERR, "MDI engine requires consecutive atom IDs");

+  // optional args
+
+  elements = nullptr;
+
+  int iarg = 0;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"elements") == 0) {
+      int ntypes = atom->ntypes;
+      delete [] elements;
+      elements = new int[ntypes+1];
+      if (iarg+ntypes+1 > narg) error->all(FLERR,"Illegal mdi engine command");
+      for (int i = 1; i <= ntypes; i++) {
+        elements[i] = utils::inumeric(FLERR,arg[iarg+i],false,lmp);
+        if (elements[i] < 0 || elements[i] > MAXELEMENT)
+          error->all(FLERR,"Illegal mdi engine command");
+      }
+      iarg += ntypes+1;
+    } else error->all(FLERR,"Illegal mdi engine command");
+  }
+
+  // error check an MDI element does not map to multiple atom types
+
+  if (elements) {
+    int ntypes = atom->ntypes;
+    for (int i = 1; i < ntypes; i++)
+      for (int j = i+1; j <= ntypes; j++) {
+        if (elements[i] == 0 || elements[j] == 0) continue;
+        if (elements[i] == elements[j])
+          error->all(FLERR,"MDI engine element cannot map to multiple types");
+      }
+  }
+
  // confirm LAMMPS is being run as an engine

  int role;
@ -135,7 +167,7 @@ MDIEngine::MDIEngine(LAMMPS *_lmp, int narg, char ** /*arg*/) : Pointers(_lmp)
  ibuf1 = ibuf1all = nullptr;

  maxatom = 0;
-  sys_natoms = atom->natoms;
+  sys_natoms = static_cast<int> (atom->natoms);
  reallocate();

  nsteps = 0;
@ -194,6 +226,8 @@ MDIEngine::MDIEngine(LAMMPS *_lmp, int narg, char ** /*arg*/) : Pointers(_lmp)

  // clean up

+  delete[] elements;
+
  delete[] mdicmd;
  delete[] node_engine;
  delete[] node_driver;
@ -299,6 +333,11 @@ int MDIEngine::execute_command(const char *command, MDI_Comm mdicomm)
  } else if (strcmp(command, ">COORDS") == 0) {
    receive_coords();

+  } else if (strcmp(command, ">ELEMENTS") == 0) {
+    if (!elements)
+      error->all(FLERR,"MDI engine command did not define element list");
+    receive_elements();
+
  } else if (strcmp(command, ">FORCES") == 0) {
    receive_double3(FORCE);

@ -323,7 +362,7 @@ int MDIEngine::execute_command(const char *command, MDI_Comm mdicomm)
    else
      receive_double3(VELOCITY);

-    // -----------------------------------------------
+  // -----------------------------------------------

  } else if (strcmp(command, "<@") == 0) {
    ierr = MDI_Send(node_engine, MDI_NAME_LENGTH, MDI_CHAR, mdicomm);
@ -372,9 +411,9 @@ int MDIEngine::execute_command(const char *command, MDI_Comm mdicomm)
  } else if (strcmp(command, "<VELOCITIES") == 0) {
    send_double3(VELOCITY);

-    // -----------------------------------------------
+  // -----------------------------------------------

-    // MDI action commands at @DEFAULT node
+  // MDI action commands at @DEFAULT node

  } else if (strcmp(command, "MD") == 0) {
    md();
@ -382,9 +421,9 @@ int MDIEngine::execute_command(const char *command, MDI_Comm mdicomm)
  } else if (strcmp(command, "OPTG") == 0) {
    optg();

-    // -----------------------------------------------
+  // -----------------------------------------------

-    // MDI node commands
+  // MDI node commands

  } else if (strcmp(command, "@INIT_MD") == 0) {
    if (mode != DEFAULT) error->all(FLERR, "MDI: MDI engine is already performing a simulation");
@ -419,14 +458,14 @@ int MDIEngine::execute_command(const char *command, MDI_Comm mdicomm)
    strncpy(node_driver, command, MDI_COMMAND_LENGTH);
    node_match = false;

-    // exit command
+  // exit command

  } else if (strcmp(command, "EXIT") == 0) {
    exit_command = true;

-    // -------------------------------------------------------
-    // custom LAMMPS commands
-    // -------------------------------------------------------
+  // -------------------------------------------------------
+  // custom LAMMPS commands
+  // -------------------------------------------------------

  } else if (strcmp(command, "NBYTES") == 0) {
    nbytes_command();
@ -439,9 +478,9 @@ int MDIEngine::execute_command(const char *command, MDI_Comm mdicomm)
  } else if (strcmp(command, "<KE") == 0) {
    send_ke();

-    // -------------------------------------------------------
-    // unknown command
-    // -------------------------------------------------------
+  // -------------------------------------------------------
+  // unknown command
+  // -------------------------------------------------------

  } else {
    error->all(FLERR, "MDI: Unknown command {} received from driver", command);
@ -479,6 +518,7 @@ void MDIEngine::mdi_commands()
  MDI_Register_command("@DEFAULT", ">CELL_DISPL");
  MDI_Register_command("@DEFAULT", ">CHARGES");
  MDI_Register_command("@DEFAULT", ">COORDS");
+  MDI_Register_command("@DEFAULT", ">ELEMENTS");
  MDI_Register_command("@DEFAULT", ">NATOMS");
  MDI_Register_command("@DEFAULT", ">NSTEPS");
  MDI_Register_command("@DEFAULT", ">TOLERANCE");
@ -914,7 +954,7 @@ void MDIEngine::evaluate()

 /* ----------------------------------------------------------------------
   create a new system
-   >CELL, >NATOMS, >TYPES, >COORDS commands are required
+   >CELL, >NATOMS, >TYPES or >ELEMENTS, >COORDS commands are required
   >CELL_DISPL, >CHARGES, >VELOCITIES commands are optional
 ---------------------------------------------------------------------- */

@ -924,8 +964,8 @@ void MDIEngine::create_system()

  if (flag_cell == 0 || flag_natoms == 0 || flag_types == 0 || flag_coords == 0)
    error->all(FLERR,
-               "MDI create_system requires >CELL, >NATOMS, >TYPES, >COORDS "
-               "MDI commands");
+               "MDI create_system requires >CELL, >NATOMS, "
+               ">TYPES or >ELEMENTS, >COORDS MDI commands");

  // remove all existing atoms via delete_atoms command

@ -955,16 +995,23 @@ void MDIEngine::create_system()
  lammps_reset_box(lmp, boxlo, boxhi, xy, yz, xz);

  // invoke lib->create_atoms()
+  // create list of 1 to sys_natoms IDs
  // optionally set charges if specified by ">CHARGES"

+  tagint* sys_ids;
+  memory->create(sys_ids, sys_natoms, "mdi:sys_ids");
+  for (int i = 0; i < sys_natoms; i++) sys_ids[i] = i+1;
+
  if (flag_velocities)
-    lammps_create_atoms(lmp, sys_natoms, nullptr, sys_types, sys_coords, sys_velocities, nullptr,
+    lammps_create_atoms(lmp, sys_natoms, sys_ids, sys_types, sys_coords, sys_velocities, nullptr,
                        1);
  else
-    lammps_create_atoms(lmp, sys_natoms, nullptr, sys_types, sys_coords, nullptr, nullptr, 1);
+    lammps_create_atoms(lmp, sys_natoms, sys_ids, sys_types, sys_coords, nullptr, nullptr, 1);

  if (flag_charges) lammps_scatter_atoms(lmp, (char *) "q", 1, 1, sys_charges);

+  memory->destroy(sys_ids);
+
  // new system

  update->ntimestep = 0;
@ -1153,6 +1200,38 @@ void MDIEngine::receive_coords()
  for (int i = 0; i < n; i++) sys_coords[i] *= mdi2lmp_length;
 }

+/* ----------------------------------------------------------------------
+   >ELEMENTS command
+   receive elements for each atom = atomic numbers
+   convert to LAMMPS atom types and store in sys_types
+---------------------------------------------------------------------- */
+
+void MDIEngine::receive_elements()
+{
+  actionflag = 0;
+  flag_types = 1;
+
+  // atomic numbers arrive in sys_types and are replaced in place below
+
+  if (MDI_Recv(sys_types, sys_natoms, MDI_INT, mdicomm))
+    error->all(FLERR, "MDI: >ELEMENTS data");
+  MPI_Bcast(sys_types, sys_natoms, MPI_INT, 0, world);
+
+  // translate each atomic number into the first LAMMPS atom type
+  // whose entry in the element list of the mdi engine command matches it
+
+  const int ntypes = atom->ntypes;
+
+  for (int iatom = 0; iatom < sys_natoms; iatom++) {
+    int match = 0;
+    for (int itype = 1; itype <= ntypes && !match; itype++)
+      if (elements[itype] == sys_types[iatom]) match = itype;
+    if (!match)
+      error->all(FLERR,"MDI element not found in element list");
+    sys_types[iatom] = match;
+  }
+}
+
 /* ----------------------------------------------------------------------
   >NATOMS command
   natoms cannot exceed 32-bit int for use with MDI
@ -1239,7 +1318,7 @@ void MDIEngine::receive_velocities()

 void MDIEngine::receive_double3(int which)
 {
-  int n = 3 * atom->natoms;
+  int n = 3 * sys_natoms;
  int ierr = MDI_Recv(buf3, n, MDI_DOUBLE, mdicomm);
  if (ierr) error->all(FLERR, "MDI: <double3 data");
  MPI_Bcast(buf3, n, MPI_DOUBLE, 0, world);
@ -1352,10 +1431,10 @@ void MDIEngine::send_total_energy()

 void MDIEngine::send_labels()
 {
-  auto labels = new char[atom->natoms * MDI_LABEL_LENGTH];
-  memset(labels, ' ', atom->natoms * MDI_LABEL_LENGTH);
+  auto labels = new char[sys_natoms * MDI_LABEL_LENGTH];
+  memset(labels, ' ', sys_natoms * MDI_LABEL_LENGTH);

-  memset(ibuf1, 0, atom->natoms * sizeof(int));
+  memset(ibuf1, 0, sys_natoms * sizeof(int));

  // use atomID to index into ordered ibuf1

@ -1370,17 +1449,17 @@ void MDIEngine::send_labels()
    ibuf1[ilocal] = type[i];
  }

-  MPI_Reduce(ibuf1, ibuf1all, atom->natoms, MPI_INT, MPI_SUM, 0, world);
+  MPI_Reduce(ibuf1, ibuf1all, sys_natoms, MPI_INT, MPI_SUM, 0, world);

  if (comm->me == 0) {
-    for (int iatom = 0; iatom < atom->natoms; iatom++) {
+    for (int iatom = 0; iatom < sys_natoms; iatom++) {
      std::string label = std::to_string(ibuf1all[iatom]);
      int label_len = std::min(int(label.length()), MDI_LABEL_LENGTH);
      strncpy(&labels[iatom * MDI_LABEL_LENGTH], label.c_str(), label_len);
    }
  }

-  int ierr = MDI_Send(labels, atom->natoms * MDI_LABEL_LENGTH, MDI_CHAR, mdicomm);
+  int ierr = MDI_Send(labels, sys_natoms * MDI_LABEL_LENGTH, MDI_CHAR, mdicomm);
  if (ierr) error->all(FLERR, "MDI: <LABELS data");

  delete[] labels;
@ -1393,8 +1472,7 @@ void MDIEngine::send_labels()

 void MDIEngine::send_natoms()
 {
-  int natoms = static_cast<int>(atom->natoms);
-  int ierr = MDI_Send(&natoms, 1, MDI_INT, mdicomm);
+  int ierr = MDI_Send(&sys_natoms, 1, MDI_INT, mdicomm);    // send sys_natoms as a single MDI_INT
   if (ierr != 0) error->all(FLERR, "MDI: <NATOMS data");
 }

@ -1414,16 +1492,21 @@ void MDIEngine::send_pe()

 /* ----------------------------------------------------------------------
   <STRESS command
-   send 6-component stress tensor (no kinetic energy term)
+   send 9-component stress tensor (no kinetic energy term)
 ---------------------------------------------------------------------- */

 void MDIEngine::send_stress()
 {
-  double vtensor[6];
+  double vtensor_full[9];    // 9 values sent below; index pattern suggests a row-major 3x3 matrix (assumed layout)
   press->compute_vector();
-  for (int i = 0; i < 6; i++) vtensor[i] = press->vector[i] * lmp2mdi_pressure;
+  vtensor_full[0] = press->vector[0] * lmp2mdi_pressure;    // slots 0, 4, 8 take components 0, 1, 2
+  vtensor_full[4] = press->vector[1] * lmp2mdi_pressure;
+  vtensor_full[8] = press->vector[2] * lmp2mdi_pressure;
+  vtensor_full[1] = vtensor_full[3] = press->vector[3] * lmp2mdi_pressure;    // components 3, 4, 5 each fill a mirrored pair of slots
+  vtensor_full[2] = vtensor_full[6] = press->vector[4] * lmp2mdi_pressure;
+  vtensor_full[5] = vtensor_full[7] = press->vector[5] * lmp2mdi_pressure;

-  int ierr = MDI_Send(vtensor, 6, MDI_DOUBLE, mdicomm);
+  int ierr = MDI_Send(vtensor_full, 9, MDI_DOUBLE, mdicomm);
   if (ierr) error->all(FLERR, "MDI: <STRESS data");
 }

@ -1435,7 +1518,7 @@ void MDIEngine::send_stress()

 void MDIEngine::send_double1(int which)
 {
-  memset(buf1, 0, atom->natoms * sizeof(double));
+  memset(buf1, 0, sys_natoms * sizeof(double));

  // use atomID to index into ordered buf1

@ -1467,9 +1550,9 @@ void MDIEngine::send_double1(int which)
    }
  }

-  MPI_Reduce(buf1, buf1all, atom->natoms, MPI_DOUBLE, MPI_SUM, 0, world);
+  MPI_Reduce(buf1, buf1all, sys_natoms, MPI_DOUBLE, MPI_SUM, 0, world);

-  int ierr = MDI_Send(buf1all, atom->natoms, MDI_DOUBLE, mdicomm);
+  int ierr = MDI_Send(buf1all, sys_natoms, MDI_DOUBLE, mdicomm);
  if (ierr) error->all(FLERR, "MDI: <double1 data");
 }

@ -1481,7 +1564,7 @@ void MDIEngine::send_double1(int which)

 void MDIEngine::send_int1(int which)
 {
-  memset(ibuf1, 0, atom->natoms * sizeof(int));
+  memset(ibuf1, 0, sys_natoms * sizeof(int));

  // use atomID to index into ordered ibuf1

@ -1498,9 +1581,9 @@ void MDIEngine::send_int1(int which)
    }
  }

-  MPI_Reduce(ibuf1, ibuf1all, atom->natoms, MPI_INT, MPI_SUM, 0, world);
+  MPI_Reduce(ibuf1, ibuf1all, sys_natoms, MPI_INT, MPI_SUM, 0, world);

-  int ierr = MDI_Send(ibuf1all, atom->natoms, MDI_INT, mdicomm);
+  int ierr = MDI_Send(ibuf1all, sys_natoms, MDI_INT, mdicomm);
  if (ierr) error->all(FLERR, "MDI: <int1 data");
 }

@ -1512,7 +1595,7 @@ void MDIEngine::send_int1(int which)

 void MDIEngine::send_double3(int which)
 {
-  memset(buf3, 0, 3 * atom->natoms * sizeof(double));
+  memset(buf3, 0, 3 * sys_natoms * sizeof(double));

  // use atomID to index into ordered buf3

@ -1547,9 +1630,9 @@ void MDIEngine::send_double3(int which)
    }
  }

-  MPI_Reduce(buf3, buf3all, 3 * atom->natoms, MPI_DOUBLE, MPI_SUM, 0, world);
+  MPI_Reduce(buf3, buf3all, 3 * sys_natoms, MPI_DOUBLE, MPI_SUM, 0, world);

-  int ierr = MDI_Send(buf3all, 3 * atom->natoms, MDI_DOUBLE, mdicomm);
+  int ierr = MDI_Send(buf3all, 3 * sys_natoms, MDI_DOUBLE, mdicomm);
  if (ierr) error->all(FLERR, "MDI: <double3 data");
 }

@ -1663,7 +1746,8 @@ void MDIEngine::reallocate()
 {
  if (sys_natoms <= maxatom) return;

-  if (3 * sys_natoms > MAXSMALLINT) error->all(FLERR, "Natoms too large to use with mdi engine");
+  bigint nsize = (bigint) sys_natoms * 3;
+  if (nsize > MAXSMALLINT) error->all(FLERR, "Natoms too large to use with mdi engine");

  maxatom = sys_natoms;

--- a/src/MDI/mdi_engine.h
+++ b/src/MDI/mdi_engine.h
@ -70,6 +70,8 @@ class MDIEngine : protected Pointers {

  int actionflag;    // 1 if MD or OPTG just completed, else 0

+  int *elements;
+
  // buffers for MDI comm

  int maxatom;
@ -106,6 +108,7 @@ class MDIEngine : protected Pointers {
  void receive_cell_displ();
  void receive_charges();
  void receive_coords();
+  void receive_elements();
  void receive_natoms();
  void receive_nsteps();
  void receive_tolerance();
--- a/src/MDI/mdi_plugin.cpp
+++ b/src/MDI/mdi_plugin.cpp
@ -19,7 +19,6 @@
 #include "mdi_plugin.h"

 #include "error.h"
-#include "fix_mdi_aimd.h"
 #include "input.h"
 #include "modify.h"

--- a/src/MEAM/meam.h
+++ b/src/MEAM/meam.h
@ -27,9 +27,11 @@ typedef enum { FCC, BCC, HCP, DIM, DIA, DIA3, B1, C11, L12, B2, CH4, LIN, ZIG, T
 class MEAM {
 public:
  MEAM(Memory *mem);
-  ~MEAM();
+  virtual ~MEAM();

- private:
+  int copymode;
+
+ protected:
  Memory *memory;

  // cutforce = force cutoff
@ -285,8 +287,8 @@ class MEAM {
                         double *rozero, int *ibar);
  void meam_setup_param(int which, double value, int nindex, int *index /*index(3)*/,
                        int *errorflag);
-  void meam_setup_done(double *cutmax);
-  void meam_dens_setup(int atom_nmax, int nall, int n_neigh);
+  virtual void meam_setup_done(double *cutmax);
+  virtual void meam_dens_setup(int atom_nmax, int nall, int n_neigh);
  void meam_dens_init(int i, int ntype, int *type, int *fmap, double **x, int numneigh,
                      int *firstneigh, int numneigh_full, int *firstneigh_full, int fnoffset);
  void meam_dens_final(int nlocal, int eflag_either, int eflag_global, int eflag_atom,
--- a/src/MEAM/meam_impl.cpp
+++ b/src/MEAM/meam_impl.cpp
@ -36,6 +36,7 @@ MEAM::MEAM(Memory* mem)

  maxneigh = 0;
  scrfcn = dscrfcn = fcpair = nullptr;
+  copymode = 0;

  neltypes = 0;
  for (int i = 0; i < maxelt; i++) {
@ -53,6 +54,8 @@ MEAM::MEAM(Memory* mem)

 MEAM::~MEAM()
 {
+  if (copymode) return;
+
  memory->destroy(this->phirar6);
  memory->destroy(this->phirar5);
  memory->destroy(this->phirar4);
--- a/src/MEAM/pair_meam.cpp
+++ b/src/MEAM/pair_meam.cpp
@ -73,7 +73,10 @@ PairMEAM::PairMEAM(LAMMPS *lmp) : Pair(lmp)

 PairMEAM::~PairMEAM()
 {
-  delete meam_inst;
+  if (copymode) return;
+
+  if (meam_inst)
+    delete meam_inst;

  if (allocated) {
    memory->destroy(setflag);
--- a/src/MEAM/pair_meam.h
+++ b/src/MEAM/pair_meam.h
@ -43,7 +43,7 @@ class PairMEAM : public Pair {
  void unpack_reverse_comm(int, int *, double *) override;
  double memory_usage() override;

- private:
+ protected:
  class MEAM *meam_inst;
  double cutmax;                           // max cutoff for all elements
  int nlibelements;                        // # of library elements
--- a/src/ML-SNAP/compute_grid.cpp
+++ b/src/ML-SNAP/compute_grid.cpp
@ -0,0 +1,242 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_grid.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "update.h"
+
+#include <cstring>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+ComputeGrid::ComputeGrid(LAMMPS *lmp, int narg, char **arg) :
+    Compute(lmp, narg, arg), grid(nullptr), gridall(nullptr), gridlocal(nullptr)
+{
+  if (narg < 6) error->all(FLERR, "Illegal compute grid command");
+
+  // output is a global array; the column count is left at zero here
+
+  array_flag = 1;
+  extarray = 0;
+  size_array_rows = 0;
+  size_array_cols = 0;
+
+  // arguments handled here start at arg[3]:
+  // the keyword "grid" followed by the three grid dimensions
+
+  const int first = 3;
+  if (strcmp(arg[first], "grid") != 0) error->all(FLERR, "Illegal compute grid command");
+
+  const int next = first + 4;
+  if (next > narg) error->all(FLERR, "Illegal compute grid command");
+
+  nx = utils::inumeric(FLERR, arg[first + 1], false, lmp);
+  ny = utils::inumeric(FLERR, arg[first + 2], false, lmp);
+  nz = utils::inumeric(FLERR, arg[first + 3], false, lmp);
+  if (nx <= 0 || ny <= 0 || nz <= 0) error->all(FLERR, "All grid dimensions must be positive");
+
+  // number of arguments consumed by this class
+
+  nargbase = next - first;
+
+  // one row per global grid point; 3 base columns hold the coordinates
+
+  size_array_rows = nx * ny * nz;
+  size_array_cols_base = 3;
+  gridlocal_allocated = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+ComputeGrid::~ComputeGrid()
+{
+  deallocate();    // frees grid, gridall and, if it was allocated, gridlocal
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeGrid::setup()
+{
+  deallocate();         // drop arrays left from any previous setup
+  set_grid_global();    // box pointers and grid spacing from the current domain
+  set_grid_local();     // index bounds of the sub-brick owned by this proc
+  allocate();           // create arrays for the new layout
+}
+
+/* ----------------------------------------------------------------------
+   convert global array index to box coords
+------------------------------------------------------------------------- */
+
+void ComputeGrid::grid2x(int igrid, double *x)
+{
+  // unpack the flat row index: ix varies fastest, then iy, then iz
+
+  int iz = igrid / (nx * ny);
+  igrid -= iz * (nx * ny);
+  int iy = igrid / nx;
+  igrid -= iy * nx;
+  int ix = igrid;
+
+  // offset of the grid point from the lower box corner
+  // (in lamda units when the box is triclinic)
+
+  x[0] = ix * delx;
+  x[1] = iy * dely;
+  x[2] = iz * delz;
+
+  // triclinic: lamda2x() converts the lamda coords to box coords
+  // orthogonal: add the lower box corner so the result is a position
+  //   in box coords, the same frame used by sublo/subhi
+
+  if (triclinic) {
+    domain->lamda2x(x, x);
+  } else {
+    x[0] += boxlo[0];
+    x[1] += boxlo[1];
+    x[2] += boxlo[2];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   copy coords to global array
+------------------------------------------------------------------------- */
+
+void ComputeGrid::assign_coords_all()
+{
+  // first three columns of every gridall row receive the grid point coords
+
+  double xyz[3];
+  for (int irow = 0; irow < size_array_rows; irow++) {
+    grid2x(irow, xyz);
+    for (int idim = 0; idim < 3; idim++) gridall[irow][idim] = xyz[idim];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   create arrays
+------------------------------------------------------------------------- */
+
+void ComputeGrid::allocate()
+{
+  // global arrays are created on every proc; gridall is the one exposed as the compute's array
+
+  memory->create(grid, size_array_rows, size_array_cols, "grid:grid");
+  memory->create(gridall, size_array_rows, size_array_cols, "grid:gridall");
+  array = gridall;
+
+  // local brick is created only if this proc owns grid points in all three dims
+
+  const bool empty_brick = (nxlo > nxhi) || (nylo > nyhi) || (nzlo > nzhi);
+  if (empty_brick) return;
+
+  memory->create4d_offset(gridlocal, size_array_cols, nzlo, nzhi, nylo, nyhi, nxlo, nxhi,
+                          "grid:gridlocal");
+  gridlocal_allocated = 1;
+}
+
+/* ----------------------------------------------------------------------
+   free arrays
+------------------------------------------------------------------------- */
+
+void ComputeGrid::deallocate()
+{
+  // the exposed array pointer must not outlive the storage it refers to
+
+  array = nullptr;
+
+  // gridlocal exists only if allocate() found a non-empty local brick
+
+  if (gridlocal_allocated) {
+    memory->destroy4d_offset(gridlocal, nzlo, nylo, nxlo);
+    gridlocal_allocated = 0;
+  }
+
+  memory->destroy(gridall);
+  memory->destroy(grid);
+}
+
+/* ----------------------------------------------------------------------
+   set global grid
+------------------------------------------------------------------------- */
+
+void ComputeGrid::set_grid_global()
+{
+  // select box and sub-domain descriptors matching the box type:
+  // plain values for orthogonal boxes, lamda values for triclinic boxes
+
+  triclinic = domain->triclinic;
+  const bool orthogonal = (triclinic == 0);
+
+  prd = orthogonal ? domain->prd : domain->prd_lamda;
+  boxlo = orthogonal ? domain->boxlo : domain->boxlo_lamda;
+  sublo = orthogonal ? domain->sublo : domain->sublo_lamda;
+  subhi = orthogonal ? domain->subhi : domain->subhi_lamda;
+
+  // inverse spacing = grid points per unit box length in each dim
+
+  delxinv = nx / prd[0];
+  delyinv = ny / prd[1];
+  delzinv = nz / prd[2];
+
+  // spacing is the reciprocal of the inverse spacing
+
+  delx = 1.0 / delxinv;
+  dely = 1.0 / delyinv;
+  delz = 1.0 / delzinv;
+}
+
+/* ----------------------------------------------------------------------
+   set local subset of grid that I own
+   n xyz lo/hi = 3d brick that I own (inclusive)
+------------------------------------------------------------------------- */
+
+void ComputeGrid::set_grid_local()
+{
+  // nx,ny,nz = extent of global grid
+  // indices into the global grid range from 0 to N-1 in each dim
+  // if grid point is inside my sub-domain I own it,
+  //   this includes sub-domain lo boundary but excludes hi boundary
+  // ixyz lo/hi = inclusive lo/hi bounds of global grid sub-brick I own
+  // if proc owns no grid cells in a dim, then ilo > ihi
+  // if 2 procs share a boundary a grid point is exactly on,
+  //   the 2 equality if tests ensure a consistent decision
+  //   as to which proc owns it
+
+  double xfraclo, xfrachi, yfraclo, yfrachi, zfraclo, zfrachi;    // lo/hi bounds of my sub-domain; scaled by nx/ny/nz below
+
+  if (comm->layout != Comm::LAYOUT_TILED) {    // non-tiled layout: index the per-dim split arrays with my proc location
+    xfraclo = comm->xsplit[comm->myloc[0]];
+    xfrachi = comm->xsplit[comm->myloc[0] + 1];
+    yfraclo = comm->ysplit[comm->myloc[1]];
+    yfrachi = comm->ysplit[comm->myloc[1] + 1];
+    zfraclo = comm->zsplit[comm->myloc[2]];
+    zfrachi = comm->zsplit[comm->myloc[2] + 1];
+  } else {    // tiled layout: read my own lo/hi pair for each dim
+    xfraclo = comm->mysplit[0][0];
+    xfrachi = comm->mysplit[0][1];
+    yfraclo = comm->mysplit[1][0];
+    yfrachi = comm->mysplit[1][1];
+    zfraclo = comm->mysplit[2][0];
+    zfrachi = comm->mysplit[2][1];
+  }
+
+  nxlo = static_cast<int>(xfraclo * nx);
+  if (1.0 * nxlo != xfraclo * nx) nxlo++;    // lo bound is not exactly on a grid point: first owned point is the next one
+  nxhi = static_cast<int>(xfrachi * nx);
+  if (1.0 * nxhi == xfrachi * nx) nxhi--;    // hi bound is exactly on a grid point: that point is excluded
+
+  nylo = static_cast<int>(yfraclo * ny);
+  if (1.0 * nylo != yfraclo * ny) nylo++;
+  nyhi = static_cast<int>(yfrachi * ny);
+  if (1.0 * nyhi == yfrachi * ny) nyhi--;
+
+  nzlo = static_cast<int>(zfraclo * nz);
+  if (1.0 * nzlo != zfraclo * nz) nzlo++;
+  nzhi = static_cast<int>(zfrachi * nz);
+  if (1.0 * nzhi == zfrachi * nz) nzhi--;
+
+  ngridlocal = (nxhi - nxlo + 1) * (nyhi - nylo + 1) * (nzhi - nzlo + 1);    // product of the three inclusive index ranges
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of local data
+------------------------------------------------------------------------- */
+
+double ComputeGrid::memory_usage()
+{
+  // do the arithmetic in double: the int product rows * cols
+  // can overflow before it is widened when the grid is large
+
+  const double nbytes_global =
+      static_cast<double>(size_array_rows) * size_array_cols * sizeof(double);
+
+  double nbytes = nbytes_global;                                                   // grid
+  nbytes += nbytes_global;                                                         // gridall
+  nbytes += static_cast<double>(size_array_cols) * ngridlocal * sizeof(double);    // gridlocal
+  return nbytes;
+}
--- a/src/ML-SNAP/compute_grid.h
+++ b/src/ML-SNAP/compute_grid.h
@ -0,0 +1,58 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_COMPUTE_GRID_H
+#define LMP_COMPUTE_GRID_H
+
+#include "compute.h"
+
+namespace LAMMPS_NS {
+
+class ComputeGrid : public Compute {    // abstract: compute_array() is pure virtual
+ public:
+  ComputeGrid(class LAMMPS *, int, char **);
+  ~ComputeGrid() override;
+  void setup() override;    // rebuilds grid layout and arrays
+  void compute_array() override = 0;
+
+  double memory_usage() override;    // bytes counted for grid, gridall and gridlocal
+
+ protected:
+  int nx, ny, nz;                            // global grid dimensions
+  int nxlo, nxhi, nylo, nyhi, nzlo, nzhi;    // local grid bounds, inclusive
+  int ngridlocal;                            // number of local grid points
+  int nvalues;                               // number of values per grid point
+  double **grid;                             // global grid
+  double **gridall;                          // global grid summed over procs; exposed as Compute::array by allocate()
+  double ****gridlocal;                      // local grid; allocated only if the local brick is non-empty
+  int triclinic;                             // triclinic flag
+  double *boxlo, *prd;                       // box info (units real/ortho or reduced/tri)
+  double *sublo, *subhi;                     // subdomain info (units real/ortho or reduced/tri)
+  double delxinv, delyinv, delzinv;          // inverse grid spacing
+  double delx, dely, delz;                   // grid spacing
+  int nargbase;                              // number of base class args
+  double cutmax;                             // largest cutoff distance
+  int size_array_cols_base;                  // number of columns used for coords, etc. (set to 3 in the constructor)
+  int gridlocal_allocated;                   // shows if gridlocal allocated
+
+  void allocate();               // create arrays
+  void deallocate();             // free arrays
+  void grid2x(int, double *);    // convert grid point to coord
+  void assign_coords_all();      // assign coords for global grid
+  void set_grid_global();        // set global grid
+  void set_grid_local();         // set bounds for local grid
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
--- a/src/ML-SNAP/compute_grid_local.cpp
+++ b/src/ML-SNAP/compute_grid_local.cpp
@ -0,0 +1,270 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_grid_local.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "update.h"
+
+#include <cstring>
+
+// For the subdomain test below; grid-points and subdomain boundaries
+// sometimes differ by minimal amounts (in the order of 2e-17).
+static constexpr double EPSILON = 1.0e-10;
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+ComputeGridLocal::ComputeGridLocal(LAMMPS *lmp, int narg, char **arg) :
+    Compute(lmp, narg, arg), alocal(nullptr)
+{
+  if (narg < 6) error->all(FLERR, "Illegal compute grid/local command");
+
+  // output is a local array; its shape is left at zero here
+
+  local_flag = 1;
+  extarray = 0;
+  size_local_rows = 0;
+  size_local_cols = 0;
+
+  // arguments handled here start at arg[3]:
+  // the keyword "grid" followed by the three grid dimensions
+
+  const int first = 3;
+  if (strcmp(arg[first], "grid") != 0) error->all(FLERR, "Illegal compute grid/local command");
+
+  const int next = first + 4;
+  if (next > narg) error->all(FLERR, "Illegal compute grid/local command");
+
+  nx = utils::inumeric(FLERR, arg[first + 1], false, lmp);
+  ny = utils::inumeric(FLERR, arg[first + 2], false, lmp);
+  nz = utils::inumeric(FLERR, arg[first + 3], false, lmp);
+  if (nx <= 0 || ny <= 0 || nz <= 0)
+    error->all(FLERR, "All grid/local dimensions must be positive");
+
+  // number of arguments consumed by this class
+
+  nargbase = next - first;
+
+  // 6 base columns: 3 grid indices followed by 3 coordinates
+
+  size_local_cols_base = 6;
+  gridlocal_allocated = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+ComputeGridLocal::~ComputeGridLocal()
+{
+  deallocate();    // frees alocal if it was allocated
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeGridLocal::setup()
+{
+  deallocate();         // drop alocal left from any previous setup
+  set_grid_global();    // box pointers and grid spacing from the current domain
+  set_grid_local();     // index bounds owned by this proc; also sets size_local_rows
+  allocate();           // create alocal if this proc owns any grid points
+  assign_coords();      // fill the index and coordinate columns of alocal
+}
+
+/* ----------------------------------------------------------------------
+   convert global array indexes to box coords
+------------------------------------------------------------------------- */
+
+void ComputeGridLocal::grid2x(int ix, int iy, int iz, double *x)
+{
+  // offset of the grid point from the lower box corner
+  // (in lamda units when the box is triclinic)
+
+  x[0] = ix * delx;
+  x[1] = iy * dely;
+  x[2] = iz * delz;
+
+  // triclinic: lamda2x() converts the lamda coords to box coords
+  // orthogonal: add the lower box corner so the result is a position
+  //   in box coords, the frame in which sublo/subhi are given;
+  //   without the shift, a box whose lower corner is not at the origin
+  //   yields points that do not line up with the subdomain bounds
+
+  if (triclinic) {
+    domain->lamda2x(x, x);
+  } else {
+    x[0] += boxlo[0];
+    x[1] += boxlo[1];
+    x[2] += boxlo[2];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   convert global grid indexes to lamda coords by scaling each index
+   with the grid spacing; unlike grid2x, no lamda2x conversion is applied
+------------------------------------------------------------------------- */
+
+void ComputeGridLocal::grid2lamda(int ix, int iy, int iz, double *x)
+{
+  const int index[3] = {ix, iy, iz};
+  const double spacing[3] = {delx, dely, delz};
+
+  for (int idim = 0; idim < 3; idim++) x[idim] = index[idim] * spacing[idim];
+}
+
+/* ----------------------------------------------------------------------
+   create arrays
+------------------------------------------------------------------------- */
+
+void ComputeGridLocal::allocate()
+{
+  // nothing to create if this proc owns no grid points in some dim
+
+  const bool empty_brick = (nxlo > nxhi) || (nylo > nyhi) || (nzlo > nzhi);
+  if (empty_brick) return;
+
+  memory->create(alocal, size_local_rows, size_local_cols, "compute/grid/local:alocal");
+  array_local = alocal;
+  gridlocal_allocated = 1;
+}
+
+/* ----------------------------------------------------------------------
+   free arrays
+------------------------------------------------------------------------- */
+
+void ComputeGridLocal::deallocate()
+{
+  // the exposed array pointer is always reset, whether or not storage exists
+
+  array_local = nullptr;
+  if (!gridlocal_allocated) return;
+
+  memory->destroy(alocal);
+  gridlocal_allocated = 0;
+}
+
+/* ----------------------------------------------------------------------
+   set global grid
+------------------------------------------------------------------------- */
+
+void ComputeGridLocal::set_grid_global()
+{
+  // select box and sub-domain descriptors matching the box type:
+  // plain values for orthogonal boxes, lamda values for triclinic boxes
+
+  triclinic = domain->triclinic;
+  const bool orthogonal = (triclinic == 0);
+
+  prd = orthogonal ? domain->prd : domain->prd_lamda;
+  boxlo = orthogonal ? domain->boxlo : domain->boxlo_lamda;
+  sublo = orthogonal ? domain->sublo : domain->sublo_lamda;
+  subhi = orthogonal ? domain->subhi : domain->subhi_lamda;
+
+  // inverse spacing = grid points per unit box length in each dim
+
+  delxinv = nx / prd[0];
+  delyinv = ny / prd[1];
+  delzinv = nz / prd[2];
+
+  // spacing is the reciprocal of the inverse spacing
+
+  delx = 1.0 / delxinv;
+  dely = 1.0 / delyinv;
+  delz = 1.0 / delzinv;
+}
+
+/* ----------------------------------------------------------------------
+   set local subset of grid that I own
+   n xyz lo/hi = 3d brick that I own (inclusive)
+------------------------------------------------------------------------- */
+
+void ComputeGridLocal::set_grid_local()
+{
+  // nx,ny,nz = extent of global grid
+  // indices into the global grid range from 0 to N-1 in each dim
+  // if grid point is inside my sub-domain I own it,
+  //   this includes sub-domain lo boundary but excludes hi boundary
+  // ixyz lo/hi = inclusive lo/hi bounds of global grid sub-brick I own
+  // if proc owns no grid cells in a dim, then ilo > ihi
+  // if 2 procs share a boundary a grid point is exactly on,
+  //   the 2 equality if tests ensure a consistent decision
+  //   as to which proc owns it
+
+  double xfraclo, xfrachi, yfraclo, yfrachi, zfraclo, zfrachi;    // lo/hi bounds of my sub-domain; scaled by nx/ny/nz below
+
+  if (comm->layout != Comm::LAYOUT_TILED) {    // non-tiled layout: index the per-dim split arrays with my proc location
+    xfraclo = comm->xsplit[comm->myloc[0]];
+    xfrachi = comm->xsplit[comm->myloc[0] + 1];
+    yfraclo = comm->ysplit[comm->myloc[1]];
+    yfrachi = comm->ysplit[comm->myloc[1] + 1];
+    zfraclo = comm->zsplit[comm->myloc[2]];
+    zfrachi = comm->zsplit[comm->myloc[2] + 1];
+  } else {    // tiled layout: read my own lo/hi pair for each dim
+    xfraclo = comm->mysplit[0][0];
+    xfrachi = comm->mysplit[0][1];
+    yfraclo = comm->mysplit[1][0];
+    yfrachi = comm->mysplit[1][1];
+    zfraclo = comm->mysplit[2][0];
+    zfrachi = comm->mysplit[2][1];
+  }
+
+  nxlo = static_cast<int>(xfraclo * nx);
+  if (1.0 * nxlo != xfraclo * nx) nxlo++;    // lo bound is not exactly on a grid point: first owned point is the next one
+  nxhi = static_cast<int>(xfrachi * nx);
+  if (1.0 * nxhi == xfrachi * nx) nxhi--;    // hi bound is exactly on a grid point: that point is excluded
+
+  nylo = static_cast<int>(yfraclo * ny);
+  if (1.0 * nylo != yfraclo * ny) nylo++;
+  nyhi = static_cast<int>(yfrachi * ny);
+  if (1.0 * nyhi == yfrachi * ny) nyhi--;
+
+  nzlo = static_cast<int>(zfraclo * nz);
+  if (1.0 * nzlo != zfraclo * nz) nzlo++;
+  nzhi = static_cast<int>(zfrachi * nz);
+  if (1.0 * nzhi == zfrachi * nz) nzhi--;
+
+  size_local_rows = (nxhi - nxlo + 1) * (nyhi - nylo + 1) * (nzhi - nzlo + 1);    // product of the three inclusive index ranges
+}
+
+/* ----------------------------------------------------------------------
+   copy coords to local array
+------------------------------------------------------------------------- */
+
+void ComputeGridLocal::assign_coords()
+{
+  // rows are filled in the order the owned brick is traversed:
+  // ix fastest, then iy, then iz
+
+  int nrow = 0;
+  for (int iz = nzlo; iz <= nzhi; iz++) {
+    for (int iy = nylo; iy <= nyhi; iy++) {
+      for (int ix = nxlo; ix <= nxhi; ix++) {
+        double *row = alocal[nrow++];
+
+        // columns 0-2: global grid indices
+
+        row[0] = ix;
+        row[1] = iy;
+        row[2] = iz;
+
+        // triclinic: build the point in lamda coords so it can be compared
+        //   against the lamda subdomain bounds, convert afterwards
+        // orthogonal: build the point with grid2x
+
+        double point[3];
+        if (triclinic)
+          grid2lamda(ix, iy, iz, point);
+        else
+          grid2x(ix, iy, iz, point);
+
+        // the point may sit on a subdomain face within EPSILON,
+        // but must not lie further outside in any dim
+
+        bool outside = false;
+        for (int idim = 0; idim < 3; idim++) {
+          if ((sublo[idim] - point[idim]) > EPSILON) outside = true;
+          if ((point[idim] - subhi[idim]) > EPSILON) outside = true;
+        }
+        if (outside) error->one(FLERR, "Invalid gridpoint position in compute grid/local");
+
+        // lamda to box coords, only after the subdomain check
+
+        if (triclinic) domain->lamda2x(point, point);
+
+        // columns 3-5: coordinates of the grid point
+
+        row[3] = point[0];
+        row[4] = point[1];
+        row[5] = point[2];
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of local data
+------------------------------------------------------------------------- */
+
+double ComputeGridLocal::memory_usage()
+{
+  // accumulate in double: storing the byte count in an int truncates it
+  // once alocal grows past the int range, and the function returns double anyway
+
+  const double nbytes =
+      static_cast<double>(size_local_rows) * size_local_cols * sizeof(double);    // alocal
+  return nbytes;
+}
--- a/src/ML-SNAP/compute_grid_local.h
+++ b/src/ML-SNAP/compute_grid_local.h
@ -0,0 +1,56 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_COMPUTE_GRID_LOCAL_H
+#define LMP_COMPUTE_GRID_LOCAL_H
+
+#include "compute.h"
+
+namespace LAMMPS_NS {
+
+class ComputeGridLocal : public Compute {    // abstract: compute_local() is pure virtual
+ public:
+  ComputeGridLocal(class LAMMPS *, int, char **);
+  ~ComputeGridLocal() override;
+  void setup() override;    // rebuilds grid layout, alocal and its coordinate columns
+  void compute_local() override = 0;
+
+  double memory_usage() override;    // bytes counted for alocal
+
+ protected:
+  int nx, ny, nz;                            // global grid dimensions
+  int nxlo, nxhi, nylo, nyhi, nzlo, nzhi;    // local grid bounds, inclusive
+  int nvalues;                               // number of values per grid point
+  double **alocal;                           // pointer to Compute::array_local
+  int triclinic;                             // triclinic flag
+  double *boxlo, *prd;                       // box info (units real/ortho or reduced/tri)
+  double *sublo, *subhi;                     // subdomain info (units real/ortho or reduced/tri)
+  double delxinv, delyinv, delzinv;          // inverse grid spacing
+  double delx, dely, delz;                   // grid spacing
+  int nargbase;                              // number of base class args
+  double cutmax;                             // largest cutoff distance
+  int size_local_cols_base;                  // number of columns used for coords, etc. (set to 6 in the constructor)
+  int gridlocal_allocated;                   // shows if alocal is allocated
+
+  void allocate();                             // create arrays
+  void deallocate();                           // free arrays
+  void grid2x(int, int, int, double *);        // convert global indices to coordinates
+  void grid2lamda(int, int, int, double *);    // convert global indices to lamda coordinates
+  void set_grid_global();                      // set global grid
+  void set_grid_local();                       // set bounds for local grid
+  void assign_coords();                        // assign coords for grid
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
--- a/src/ML-SNAP/compute_sna_atom.cpp
+++ b/src/ML-SNAP/compute_sna_atom.cpp
@ -35,20 +35,21 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
  radelem(nullptr), wjelem(nullptr), sinnerelem(nullptr), dinnerelem(nullptr)

 {
-  double rmin0, rfac0;
+  // begin code common to all SNAP computes
+
+  double rfac0, rmin0;
  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;

  int ntypes = atom->ntypes;
-  int nargmin = 6+2*ntypes;
+  int nargmin = 6 + 2 * ntypes;

-  if (narg < nargmin) error->all(FLERR,"Illegal compute sna/atom command");
+  if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style);

  // default values

  rmin0 = 0.0;
  switchflag = 1;
  bzeroflag = 1;
-  bnormflag = 0;
  quadraticflag = 0;
  chemflag = 0;
  bnormflag = 0;
@ -56,32 +57,34 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
  switchinnerflag = 0;
  nelements = 1;

-  // offset by 1 to match up with types
+  // process required arguments

-  memory->create(radelem,ntypes+1,"sna/atom:radelem");
-  memory->create(wjelem,ntypes+1,"sna/atom:wjelem");
+  memory->create(radelem, ntypes + 1, "sna/atom:radelem"); // offset by 1 to match up with types
+  memory->create(wjelem, ntypes + 1, "sna/atom:wjelem");

-  rcutfac = atof(arg[3]);
-  rfac0 = atof(arg[4]);
-  twojmax = atoi(arg[5]);
+  rcutfac = utils::numeric(FLERR, arg[3], false, lmp);
+  rfac0 = utils::numeric(FLERR, arg[4], false, lmp);
+  twojmax = utils::inumeric(FLERR, arg[5], false, lmp);

  for (int i = 0; i < ntypes; i++)
-    radelem[i+1] = atof(arg[6+i]);
+    radelem[i + 1] =
+        utils::numeric(FLERR, arg[6 + i], false, lmp);
  for (int i = 0; i < ntypes; i++)
-    wjelem[i+1] = atof(arg[6+ntypes+i]);
+    wjelem[i + 1] =
+        utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);

  // construct cutsq

  double cut;
  cutmax = 0.0;
-  memory->create(cutsq,ntypes+1,ntypes+1,"sna/atom:cutsq");
+  memory->create(cutsq, ntypes + 1, ntypes + 1, "sna/atom:cutsq");
  for (int i = 1; i <= ntypes; i++) {
-    cut = 2.0*radelem[i]*rcutfac;
+    cut = 2.0 * radelem[i] * rcutfac;
    if (cut > cutmax) cutmax = cut;
-    cutsq[i][i] = cut*cut;
-    for (int j = i+1; j <= ntypes; j++) {
-      cut = (radelem[i]+radelem[j])*rcutfac;
-      cutsq[i][j] = cutsq[j][i] = cut*cut;
+    cutsq[i][i] = cut * cut;
+    for (int j = i + 1; j <= ntypes; j++) {
+      cut = (radelem[i] + radelem[j]) * rcutfac;
+      cutsq[i][j] = cutsq[j][i] = cut * cut;
    }
  }

@ -95,89 +98,87 @@ ComputeSNAAtom::ComputeSNAAtom(LAMMPS *lmp, int narg, char **arg) :
  int iarg = nargmin;

  while (iarg < narg) {
-    if (strcmp(arg[iarg],"rmin0") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      rmin0 = atof(arg[iarg+1]);
+    if (strcmp(arg[iarg], "rmin0") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      rmin0 = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"switchflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      switchflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "switchflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"bzeroflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      bzeroflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "bzeroflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bzeroflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"quadraticflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      quadraticflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "quadraticflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"chem") == 0) {
-      if (iarg+2+ntypes > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
+    } else if (strcmp(arg[iarg], "chem") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
      chemflag = 1;
-      memory->create(map,ntypes+1,"compute_sna_atom:map");
-      nelements = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
+      memory->create(map, ntypes + 1, "compute_sna_grid:map");
+      nelements = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      for (int i = 0; i < ntypes; i++) {
-        int jelem = utils::inumeric(FLERR,arg[iarg+2+i],false,lmp);
-        if (jelem < 0 || jelem >= nelements)
-          error->all(FLERR,"Illegal compute sna/atom command");
-        map[i+1] = jelem;
+        int jelem = utils::inumeric(FLERR, arg[iarg + 2 + i], false, lmp);
+        if (jelem < 0 || jelem >= nelements) error->all(FLERR, "Illegal compute {} command", style);
+        map[i + 1] = jelem;
      }
-      iarg += 2+ntypes;
-    } else if (strcmp(arg[iarg],"bnormflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      bnormflag = atoi(arg[iarg+1]);
+      iarg += 2 + ntypes;
+    } else if (strcmp(arg[iarg], "bnormflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bnormflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"wselfallflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      wselfallflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "wselfallflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      wselfallflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"switchinnerflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      switchinnerflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "switchinnerflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchinnerflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"sinner") == 0) {
+    } else if (strcmp(arg[iarg], "sinner") == 0) {
      iarg++;
-      if (iarg+ntypes > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      memory->create(sinnerelem,ntypes+1,"sna/atom:sinnerelem");
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(sinnerelem, ntypes + 1, "snap:sinnerelem");
      for (int i = 0; i < ntypes; i++)
-        sinnerelem[i+1] = utils::numeric(FLERR,arg[iarg+i],false,lmp);
+        sinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
      sinnerflag = 1;
      iarg += ntypes;
-    } else if (strcmp(arg[iarg],"dinner") == 0) {
+    } else if (strcmp(arg[iarg], "dinner") == 0) {
      iarg++;
-      if (iarg+ntypes > narg)
-        error->all(FLERR,"Illegal compute sna/atom command");
-      memory->create(dinnerelem,ntypes+1,"sna/atom:dinnerelem");
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(dinnerelem, ntypes + 1, "snap:dinnerelem");
      for (int i = 0; i < ntypes; i++)
-        dinnerelem[i+1] = utils::numeric(FLERR,arg[iarg+i],false,lmp);
+        dinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
      dinnerflag = 1;
      iarg += ntypes;
-    } else error->all(FLERR,"Illegal compute sna/atom command");
+    } else
+      error->all(FLERR, "Illegal compute {} command", style);
  }

  if (switchinnerflag && !(sinnerflag && dinnerflag))
-    error->all(FLERR,"Illegal compute sna/atom command: switchinnerflag = 1, missing sinner/dinner keyword");
+    error->all(
+        FLERR,
+        "Illegal compute {} command: switchinnerflag = 1, missing sinner/dinner keyword",
+        style);

  if (!switchinnerflag && (sinnerflag || dinnerflag))
-    error->all(FLERR,"Illegal compute sna/atom command: switchinnerflag = 0, unexpected sinner/dinner keyword");
+    error->all(
+        FLERR,
+        "Illegal compute {} command: switchinnerflag = 0, unexpected sinner/dinner keyword",
+        style);

-  snaptr = new SNA(lmp, rfac0, twojmax,
-                   rmin0, switchflag, bzeroflag,
-                   chemflag, bnormflag, wselfallflag,
-                   nelements, switchinnerflag);
+  snaptr = new SNA(lmp, rfac0, twojmax, rmin0, switchflag, bzeroflag, chemflag, bnormflag,
+                   wselfallflag, nelements, switchinnerflag);

  ncoeff = snaptr->ncoeff;
-  size_peratom_cols = ncoeff;
-  if (quadraticflag) size_peratom_cols += (ncoeff*(ncoeff+1))/2;
+  nvalues = ncoeff;
+  if (quadraticflag) nvalues += (ncoeff * (ncoeff + 1)) / 2;
+
+  // end code common to all SNAP computes
+
+  size_peratom_cols = nvalues;
  peratom_flag = 1;

  nmax = 0;
--- a/src/ML-SNAP/compute_sna_atom.h
+++ b/src/ML-SNAP/compute_sna_atom.h
@ -50,6 +50,7 @@ class ComputeSNAAtom : public Compute {
  class SNA *snaptr;
  double cutmax;
  int quadraticflag;
+  int nvalues;
 };

 }    // namespace LAMMPS_NS
--- a/src/ML-SNAP/compute_sna_grid.cpp
+++ b/src/ML-SNAP/compute_sna_grid.cpp
@ -0,0 +1,320 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "sna.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+
+// Parse the compute sna/grid command line and build the SNA helper object.
+//
+// After the arguments consumed by the ComputeGrid base class are skipped,
+// the layout is the same as for the per-atom SNAP compute:
+//   arg[3]                     rcutfac
+//   arg[4]                     rfac0
+//   arg[5]                     twojmax
+//   arg[6 .. 6+ntypes-1]       one radius per atom type
+//   arg[6+ntypes .. 6+2*ntypes-1]  one weight per atom type
+//   remaining args             optional keyword/value groups
+// Any malformed input is reported through error->all().
+
+ComputeSNAGrid::ComputeSNAGrid(LAMMPS *lmp, int narg, char **arg) :
+    ComputeGrid(lmp, narg, arg), cutsq(nullptr), radelem(nullptr), wjelem(nullptr), map(nullptr),
+    sinnerelem(nullptr), dinnerelem(nullptr), snaptr(nullptr)
+{
+  // skip over arguments used by base class
+  // so that argument positions are identical to
+  // regular per-atom compute
+
+  arg += nargbase;
+  narg -= nargbase;
+
+  // begin code common to all SNAP computes
+
+  double rfac0, rmin0;
+  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+
+  int ntypes = atom->ntypes;
+  int nargmin = 6 + 2 * ntypes;
+
+  if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style);
+
+  // default values
+
+  rmin0 = 0.0;
+  switchflag = 1;
+  bzeroflag = 1;
+  quadraticflag = 0;
+  chemflag = 0;
+  bnormflag = 0;
+  wselfallflag = 0;
+  switchinnerflag = 0;
+  nelements = 1;
+
+  // process required arguments
+
+  memory->create(radelem, ntypes + 1, "sna/atom:radelem");    // offset by 1 to match up with types
+  memory->create(wjelem, ntypes + 1, "sna/atom:wjelem");
+
+  rcutfac = utils::numeric(FLERR, arg[3], false, lmp);
+  rfac0 = utils::numeric(FLERR, arg[4], false, lmp);
+  twojmax = utils::inumeric(FLERR, arg[5], false, lmp);
+
+  for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp);
+  for (int i = 0; i < ntypes; i++)
+    wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);
+
+  // construct cutsq
+  // cutsq[i][j] = ((radelem[i] + radelem[j]) * rcutfac)^2, cutmax = largest same-type cutoff
+
+  double cut;
+  cutmax = 0.0;
+  memory->create(cutsq, ntypes + 1, ntypes + 1, "sna/atom:cutsq");
+  for (int i = 1; i <= ntypes; i++) {
+    cut = 2.0 * radelem[i] * rcutfac;
+    if (cut > cutmax) cutmax = cut;
+    cutsq[i][i] = cut * cut;
+    for (int j = i + 1; j <= ntypes; j++) {
+      cut = (radelem[i] + radelem[j]) * rcutfac;
+      cutsq[i][j] = cutsq[j][i] = cut * cut;
+    }
+  }
+
+  // set local input checks
+
+  int sinnerflag = 0;
+  int dinnerflag = 0;
+
+  // process optional args
+
+  int iarg = nargmin;
+
+  while (iarg < narg) {
+    if (strcmp(arg[iarg], "rmin0") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      rmin0 = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "switchflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "bzeroflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bzeroflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "quadraticflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "chem") == 0) {
+
+      // the keyword is followed by nelements and one element index per atom type,
+      // so all 2 + ntypes arguments must be present before any of them is read
+
+      if (iarg + 2 + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      chemflag = 1;
+      memory->create(map, ntypes + 1, "compute_sna_grid:map");
+      nelements = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      for (int i = 0; i < ntypes; i++) {
+        int jelem = utils::inumeric(FLERR, arg[iarg + 2 + i], false, lmp);
+        if (jelem < 0 || jelem >= nelements) error->all(FLERR, "Illegal compute {} command", style);
+        map[i + 1] = jelem;
+      }
+      iarg += 2 + ntypes;
+    } else if (strcmp(arg[iarg], "bnormflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bnormflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "wselfallflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      wselfallflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "switchinnerflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchinnerflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "sinner") == 0) {
+      iarg++;
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(sinnerelem, ntypes + 1, "snap:sinnerelem");
+      for (int i = 0; i < ntypes; i++)
+        sinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
+      sinnerflag = 1;
+      iarg += ntypes;
+    } else if (strcmp(arg[iarg], "dinner") == 0) {
+      iarg++;
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(dinnerelem, ntypes + 1, "snap:dinnerelem");
+      for (int i = 0; i < ntypes; i++)
+        dinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
+      dinnerflag = 1;
+      iarg += ntypes;
+    } else
+      error->all(FLERR, "Illegal compute {} command", style);
+  }
+
+  // sinner and dinner must both be given if and only if switchinnerflag is set
+
+  if (switchinnerflag && !(sinnerflag && dinnerflag))
+    error->all(FLERR,
+               "Illegal compute {} command: switchinnerflag = 1, missing sinner/dinner keyword",
+               style);
+
+  if (!switchinnerflag && (sinnerflag || dinnerflag))
+    error->all(FLERR,
+               "Illegal compute {} command: switchinnerflag = 0, unexpected sinner/dinner keyword",
+               style);
+
+  snaptr = new SNA(lmp, rfac0, twojmax, rmin0, switchflag, bzeroflag, chemflag, bnormflag,
+                   wselfallflag, nelements, switchinnerflag);
+
+  // nvalues = ncoeff linear terms, plus ncoeff*(ncoeff+1)/2 products if quadraticflag is set
+
+  ncoeff = snaptr->ncoeff;
+  nvalues = ncoeff;
+  if (quadraticflag) nvalues += (ncoeff * (ncoeff + 1)) / 2;
+
+  // end code common to all SNAP computes
+
+  size_array_cols = size_array_cols_base + nvalues;
+  array_flag = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Release everything the constructor allocated.
+
+ComputeSNAGrid::~ComputeSNAGrid()
+{
+  memory->destroy(radelem);
+  memory->destroy(wjelem);
+  memory->destroy(cutsq);
+  delete snaptr;
+
+  // map exists only if the chem keyword was given
+
+  if (chemflag) memory->destroy(map);
+
+  // the constructor only completes with switchinnerflag set
+  // if both sinner and dinner were given, so both arrays exist here
+
+  if (switchinnerflag) {
+    memory->destroy(sinnerelem);
+    memory->destroy(dinnerelem);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeSNAGrid::init()
+{
+  // warn, only where comm->me == 0, if this compute style is defined more than once
+
+  const auto ninstances = modify->get_compute_by_style("^sna/grid$").size();
+  if ((ninstances > 1) && (comm->me == 0))
+    error->warning(FLERR, "More than one instance of compute sna/grid");
+
+  snaptr->init();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Evaluate the SNA values at every grid point in
+// [nxlo,nxhi] x [nylo,nyhi] x [nzlo,nzhi] and store them in gridlocal,
+// then copy them into grid and sum grid over all processes into gridall.
+
+void ComputeSNAGrid::compute_array()
+{
+  invoked_array = update->ntimestep;
+
+  // compute sna for each gridpoint
+
+  double **const x = atom->x;
+  const int *const mask = atom->mask;
+  int *const type = atom->type;
+  const int ntotal = atom->nlocal + atom->nghost;
+
+  // ensure the per-neighbor arrays of snaptr can hold ntotal entries
+
+  snaptr->grow_rij(ntotal);
+
+  for (int iz = nzlo; iz <= nzhi; iz++)
+    for (int iy = nylo; iy <= nyhi; iy++)
+      for (int ix = nxlo; ix <= nxhi; ix++) {
+        double xgrid[3];
+        const int igrid = iz * (nx * ny) + iy * nx + ix;
+        grid2x(igrid, xgrid);
+        const double xtmp = xgrid[0];
+        const double ytmp = xgrid[1];
+        const double ztmp = xgrid[2];
+
+        // currently, all grid points are type 1
+        // not clear what a better choice would be
+
+        const int itype = 1;
+        int ielem = 0;
+        if (chemflag) ielem = map[itype];
+
+        // loop over all owned and ghost atoms in the compute group and
+        // keep those within the cutoff of the grid point
+        // rij[][3] = grid point position minus atom position
+        // inside = indices of the atoms that were kept
+
+        int ninside = 0;
+        for (int j = 0; j < ntotal; j++) {
+
+          // check that j is in compute group
+
+          if (!(mask[j] & groupbit)) continue;
+
+          const double delx = xtmp - x[j][0];
+          const double dely = ytmp - x[j][1];
+          const double delz = ztmp - x[j][2];
+          const double rsq = delx * delx + dely * dely + delz * delz;
+          int jtype = type[j];
+          int jelem = 0;
+          if (chemflag) jelem = map[jtype];
+
+          if (rsq < cutsq[jtype][jtype] && rsq > 1e-20) {
+            snaptr->rij[ninside][0] = delx;
+            snaptr->rij[ninside][1] = dely;
+            snaptr->rij[ninside][2] = delz;
+            snaptr->inside[ninside] = j;
+            snaptr->wj[ninside] = wjelem[jtype];
+            snaptr->rcutij[ninside] = 2.0 * radelem[jtype] * rcutfac;
+
+            // sinnerelem and dinnerelem are filled per atom type (1..ntypes),
+            // so they must be indexed by type, not by element
+
+            if (switchinnerflag) {
+              snaptr->sinnerij[ninside] = sinnerelem[jtype];
+              snaptr->dinnerij[ninside] = dinnerelem[jtype];
+            }
+            if (chemflag) snaptr->element[ninside] = jelem;
+            ninside++;
+          }
+        }
+
+        snaptr->compute_ui(ninside, ielem);
+        snaptr->compute_zi();
+        snaptr->compute_bi(ielem);
+
+        // linear contributions
+
+        for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+          gridlocal[size_array_cols_base + icoeff][iz][iy][ix] = snaptr->blist[icoeff];
+
+        // quadratic contributions
+        // squared terms carry a factor 0.5, cross terms (jcoeff > icoeff) do not
+
+        if (quadraticflag) {
+          int ncount = ncoeff;
+          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+            double bveci = snaptr->blist[icoeff];
+            gridlocal[size_array_cols_base + ncount++][iz][iy][ix] = 0.5 * bveci * bveci;
+            for (int jcoeff = icoeff + 1; jcoeff < ncoeff; jcoeff++)
+              gridlocal[size_array_cols_base + ncount++][iz][iy][ix] =
+                  bveci * snaptr->blist[jcoeff];
+          }
+        }
+      }
+
+  // zero grid, fill in the rows of the grid points looped over above,
+  // then sum over all processes into gridall
+
+  memset(&grid[0][0], 0, size_array_rows * size_array_cols * sizeof(double));
+
+  for (int iz = nzlo; iz <= nzhi; iz++)
+    for (int iy = nylo; iy <= nyhi; iy++)
+      for (int ix = nxlo; ix <= nxhi; ix++) {
+        const int igrid = iz * (nx * ny) + iy * nx + ix;
+        for (int j = 0; j < nvalues; j++)
+          grid[igrid][size_array_cols_base + j] = gridlocal[size_array_cols_base + j][iz][iy][ix];
+      }
+  MPI_Allreduce(&grid[0][0], &gridall[0][0], size_array_rows * size_array_cols, MPI_DOUBLE, MPI_SUM,
+                world);
+  assign_coords_all();
+}
+
+/* ----------------------------------------------------------------------
+   memory usage
+------------------------------------------------------------------------- */
+
+double ComputeSNAGrid::memory_usage()
+{
+  // bytes reported by the SNA object
+  const double sna_bytes = snaptr->memory_usage();
+
+  // one int per entry of map, which has ntypes + 1 entries
+  const int nmap = atom->ntypes + 1;
+  const double map_bytes = (double) nmap * sizeof(int);
+
+  return sna_bytes + map_bytes;
+}
--- a/src/ML-SNAP/compute_sna_grid.h
+++ b/src/ML-SNAP/compute_sna_grid.h
@ -0,0 +1,54 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(sna/grid,ComputeSNAGrid);
+// clang-format on
+#else
+
+#ifndef LMP_COMPUTE_SNA_GRID_H
+#define LMP_COMPUTE_SNA_GRID_H
+
+#include "compute_grid.h"
+
+namespace LAMMPS_NS {
+
+// Compute style sna/grid: evaluates SNA values at the points of a grid
+// (grid handling comes from the ComputeGrid base class) and exposes them
+// through compute_array().
+class ComputeSNAGrid : public ComputeGrid {
+ public:
+  ComputeSNAGrid(class LAMMPS *, int, char **);    // parses the command arguments, creates snaptr
+  ~ComputeSNAGrid() override;
+  void init() override;             // warns about multiple instances, calls snaptr->init()
+  void compute_array() override;    // fills the grid array for the current timestep
+  double memory_usage() override;
+
+ private:
+  int ncoeff;          // copied from snaptr->ncoeff; number of linear values per grid point
+  double **cutsq;      // (ntypes+1) x (ntypes+1), ((radelem[i] + radelem[j]) * rcutfac)^2
+  double rcutfac;      // factor multiplying the per-type radii when building cutoffs
+  double *radelem;     // per-type radius from the command line, indices 1..ntypes
+  double *wjelem;      // per-type weight from the command line, indices 1..ntypes
+  int *map;    // map types to [0,nelements)
+  int nelements, chemflag;    // set by the chem keyword; defaults are 1 and 0
+  int switchinnerflag;        // if nonzero, the sinner and dinner keywords are required
+  double *sinnerelem;         // per-type values of the sinner keyword, allocated only if given
+  double *dinnerelem;         // per-type values of the dinner keyword, allocated only if given
+  class SNA *snaptr;          // created in the constructor, deleted in the destructor
+  double cutmax;              // largest value of 2 * radelem[i] * rcutfac over all types
+  int quadraticflag;          // if nonzero, products of pairs of linear values are added
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
--- a/src/ML-SNAP/compute_sna_grid_local.cpp
+++ b/src/ML-SNAP/compute_sna_grid_local.cpp
@ -0,0 +1,306 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "compute_sna_grid_local.h"
+
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "sna.h"
+#include "update.h"
+
+#include <cmath>
+#include <cstring>
+
+using namespace LAMMPS_NS;
+
+// Parse the compute sna/grid/local command line and build the SNA helper object.
+//
+// After the arguments consumed by the ComputeGridLocal base class are skipped,
+// the layout is the same as for the per-atom SNAP compute:
+//   arg[3]                     rcutfac
+//   arg[4]                     rfac0
+//   arg[5]                     twojmax
+//   arg[6 .. 6+ntypes-1]       one radius per atom type
+//   arg[6+ntypes .. 6+2*ntypes-1]  one weight per atom type
+//   remaining args             optional keyword/value groups
+// Any malformed input is reported through error->all().
+
+ComputeSNAGridLocal::ComputeSNAGridLocal(LAMMPS *lmp, int narg, char **arg) :
+    ComputeGridLocal(lmp, narg, arg), cutsq(nullptr), radelem(nullptr), wjelem(nullptr),
+    map(nullptr), sinnerelem(nullptr), dinnerelem(nullptr), snaptr(nullptr)
+{
+  // skip over arguments used by base class
+  // so that argument positions are identical to
+  // regular per-atom compute
+
+  arg += nargbase;
+  narg -= nargbase;
+
+  // begin code common to all SNAP computes
+
+  double rfac0, rmin0;
+  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;
+
+  int ntypes = atom->ntypes;
+  int nargmin = 6 + 2 * ntypes;
+
+  if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style);
+
+  // default values
+
+  rmin0 = 0.0;
+  switchflag = 1;
+  bzeroflag = 1;
+  quadraticflag = 0;
+  chemflag = 0;
+  bnormflag = 0;
+  wselfallflag = 0;
+  switchinnerflag = 0;
+  nelements = 1;
+
+  // process required arguments
+
+  memory->create(radelem, ntypes + 1, "sna/atom:radelem");    // offset by 1 to match up with types
+  memory->create(wjelem, ntypes + 1, "sna/atom:wjelem");
+
+  rcutfac = utils::numeric(FLERR, arg[3], false, lmp);
+  rfac0 = utils::numeric(FLERR, arg[4], false, lmp);
+  twojmax = utils::inumeric(FLERR, arg[5], false, lmp);
+
+  for (int i = 0; i < ntypes; i++) radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp);
+  for (int i = 0; i < ntypes; i++)
+    wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);
+
+  // construct cutsq
+  // cutsq[i][j] = ((radelem[i] + radelem[j]) * rcutfac)^2, cutmax = largest same-type cutoff
+
+  double cut;
+  cutmax = 0.0;
+  memory->create(cutsq, ntypes + 1, ntypes + 1, "sna/atom:cutsq");
+  for (int i = 1; i <= ntypes; i++) {
+    cut = 2.0 * radelem[i] * rcutfac;
+    if (cut > cutmax) cutmax = cut;
+    cutsq[i][i] = cut * cut;
+    for (int j = i + 1; j <= ntypes; j++) {
+      cut = (radelem[i] + radelem[j]) * rcutfac;
+      cutsq[i][j] = cutsq[j][i] = cut * cut;
+    }
+  }
+
+  // set local input checks
+
+  int sinnerflag = 0;
+  int dinnerflag = 0;
+
+  // process optional args
+
+  int iarg = nargmin;
+
+  while (iarg < narg) {
+    if (strcmp(arg[iarg], "rmin0") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      rmin0 = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "switchflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "bzeroflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bzeroflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "quadraticflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "chem") == 0) {
+
+      // the keyword is followed by nelements and one element index per atom type,
+      // so all 2 + ntypes arguments must be present before any of them is read
+
+      if (iarg + 2 + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      chemflag = 1;
+      memory->create(map, ntypes + 1, "compute_sna_grid:map");
+      nelements = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      for (int i = 0; i < ntypes; i++) {
+        int jelem = utils::inumeric(FLERR, arg[iarg + 2 + i], false, lmp);
+        if (jelem < 0 || jelem >= nelements) error->all(FLERR, "Illegal compute {} command", style);
+        map[i + 1] = jelem;
+      }
+      iarg += 2 + ntypes;
+    } else if (strcmp(arg[iarg], "bnormflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bnormflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "wselfallflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      wselfallflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "switchinnerflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchinnerflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
+      iarg += 2;
+    } else if (strcmp(arg[iarg], "sinner") == 0) {
+      iarg++;
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(sinnerelem, ntypes + 1, "snap:sinnerelem");
+      for (int i = 0; i < ntypes; i++)
+        sinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
+      sinnerflag = 1;
+      iarg += ntypes;
+    } else if (strcmp(arg[iarg], "dinner") == 0) {
+      iarg++;
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(dinnerelem, ntypes + 1, "snap:dinnerelem");
+      for (int i = 0; i < ntypes; i++)
+        dinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
+      dinnerflag = 1;
+      iarg += ntypes;
+    } else
+      error->all(FLERR, "Illegal compute {} command", style);
+  }
+
+  // sinner and dinner must both be given if and only if switchinnerflag is set
+
+  if (switchinnerflag && !(sinnerflag && dinnerflag))
+    error->all(FLERR,
+               "Illegal compute {} command: switchinnerflag = 1, missing sinner/dinner keyword",
+               style);
+
+  if (!switchinnerflag && (sinnerflag || dinnerflag))
+    error->all(FLERR,
+               "Illegal compute {} command: switchinnerflag = 0, unexpected sinner/dinner keyword",
+               style);
+
+  snaptr = new SNA(lmp, rfac0, twojmax, rmin0, switchflag, bzeroflag, chemflag, bnormflag,
+                   wselfallflag, nelements, switchinnerflag);
+
+  // nvalues = ncoeff linear terms, plus ncoeff*(ncoeff+1)/2 products if quadraticflag is set
+
+  ncoeff = snaptr->ncoeff;
+  nvalues = ncoeff;
+  if (quadraticflag) nvalues += (ncoeff * (ncoeff + 1)) / 2;
+
+  // end code common to all SNAP computes
+
+  size_local_cols = size_local_cols_base + nvalues;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Release everything the constructor allocated.
+
+ComputeSNAGridLocal::~ComputeSNAGridLocal()
+{
+  memory->destroy(radelem);
+  memory->destroy(wjelem);
+  memory->destroy(cutsq);
+  delete snaptr;
+
+  // map exists only if the chem keyword was given
+
+  if (chemflag) memory->destroy(map);
+
+  // the constructor only completes with switchinnerflag set
+  // if both sinner and dinner were given, so both arrays exist here
+
+  if (switchinnerflag) {
+    memory->destroy(sinnerelem);
+    memory->destroy(dinnerelem);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputeSNAGridLocal::init()
+{
+  // warn, only where comm->me == 0, if this compute style is defined more than once
+
+  const auto ninstances = modify->get_compute_by_style("^sna/grid/local$").size();
+  if ((ninstances > 1) && (comm->me == 0))
+    error->warning(FLERR, "More than one instance of compute sna/grid/local");
+
+  snaptr->init();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Evaluate the SNA values at every grid point in
+// [nxlo,nxhi] x [nylo,nyhi] x [nzlo,nzhi] and store them in consecutive
+// rows of alocal, starting at column size_local_cols_base.
+
+void ComputeSNAGridLocal::compute_local()
+{
+  invoked_local = update->ntimestep;
+
+  // compute sna for each gridpoint
+
+  double **const x = atom->x;
+  const int *const mask = atom->mask;
+  int *const type = atom->type;
+  const int ntotal = atom->nlocal + atom->nghost;
+
+  // ensure the per-neighbor arrays of snaptr can hold ntotal entries
+
+  snaptr->grow_rij(ntotal);
+
+  // igrid = row of alocal, incremented once per grid point visited
+
+  int igrid = 0;
+  for (int iz = nzlo; iz <= nzhi; iz++)
+    for (int iy = nylo; iy <= nyhi; iy++)
+      for (int ix = nxlo; ix <= nxhi; ix++) {
+        double xgrid[3];
+        grid2x(ix, iy, iz, xgrid);
+        const double xtmp = xgrid[0];
+        const double ytmp = xgrid[1];
+        const double ztmp = xgrid[2];
+
+        // currently, all grid points are type 1
+        // not clear what a better choice would be
+
+        const int itype = 1;
+        int ielem = 0;
+        if (chemflag) ielem = map[itype];
+
+        // loop over all owned and ghost atoms in the compute group and
+        // keep those within the cutoff of the grid point
+        // rij[][3] = grid point position minus atom position
+        // inside = indices of the atoms that were kept
+
+        int ninside = 0;
+        for (int j = 0; j < ntotal; j++) {
+
+          // check that j is in compute group
+
+          if (!(mask[j] & groupbit)) continue;
+
+          const double delx = xtmp - x[j][0];
+          const double dely = ytmp - x[j][1];
+          const double delz = ztmp - x[j][2];
+          const double rsq = delx * delx + dely * dely + delz * delz;
+          int jtype = type[j];
+          int jelem = 0;
+          if (chemflag) jelem = map[jtype];
+          if (rsq < cutsq[jtype][jtype] && rsq > 1e-20) {
+            snaptr->rij[ninside][0] = delx;
+            snaptr->rij[ninside][1] = dely;
+            snaptr->rij[ninside][2] = delz;
+            snaptr->inside[ninside] = j;
+            snaptr->wj[ninside] = wjelem[jtype];
+            snaptr->rcutij[ninside] = 2.0 * radelem[jtype] * rcutfac;
+
+            // sinnerelem and dinnerelem are filled per atom type (1..ntypes),
+            // so they must be indexed by type, not by element
+
+            if (switchinnerflag) {
+              snaptr->sinnerij[ninside] = sinnerelem[jtype];
+              snaptr->dinnerij[ninside] = dinnerelem[jtype];
+            }
+            if (chemflag)
+              snaptr->element[ninside] = jelem;    // element index for multi-element snap
+            ninside++;
+          }
+        }
+
+        snaptr->compute_ui(ninside, ielem);
+        snaptr->compute_zi();
+        snaptr->compute_bi(ielem);
+
+        // linear contributions
+
+        for (int icoeff = 0; icoeff < ncoeff; icoeff++)
+          alocal[igrid][size_local_cols_base + icoeff] = snaptr->blist[icoeff];
+
+        // quadratic contributions
+        // squared terms carry a factor 0.5, cross terms (jcoeff > icoeff) do not
+
+        if (quadraticflag) {
+          int ncount = ncoeff;
+          for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
+            double bveci = snaptr->blist[icoeff];
+            alocal[igrid][size_local_cols_base + ncount++] = 0.5 * bveci * bveci;
+            for (int jcoeff = icoeff + 1; jcoeff < ncoeff; jcoeff++)
+              alocal[igrid][size_local_cols_base + ncount++] = bveci * snaptr->blist[jcoeff];
+          }
+        }
+        igrid++;
+      }
+}
+
+/* ----------------------------------------------------------------------
+   memory usage
+------------------------------------------------------------------------- */
+
+double ComputeSNAGridLocal::memory_usage()
+{
+  // bytes reported by the SNA object
+  const double sna_bytes = snaptr->memory_usage();
+
+  // one int per entry of map, which has ntypes + 1 entries
+  const int nmap = atom->ntypes + 1;
+  const double map_bytes = (double) nmap * sizeof(int);
+
+  return sna_bytes + map_bytes;
+}
--- a/src/ML-SNAP/compute_sna_grid_local.h
+++ b/src/ML-SNAP/compute_sna_grid_local.h
@ -0,0 +1,54 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/ Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(sna/grid/local,ComputeSNAGridLocal);
+// clang-format on
+#else
+
+#ifndef LMP_COMPUTE_SNA_GRID_LOCAL_H
+#define LMP_COMPUTE_SNA_GRID_LOCAL_H
+
+#include "compute_grid_local.h"
+
+namespace LAMMPS_NS {
+
+class ComputeSNAGridLocal : public ComputeGridLocal {    // compute style sna/grid/local
+ public:
+  ComputeSNAGridLocal(class LAMMPS *, int, char **);
+  ~ComputeSNAGridLocal() override;
+  void init() override;
+  void compute_local() override;
+  double memory_usage() override;    // size of the SNA object plus the map
+
+ private:
+  int ncoeff;    // number of snaptr->blist entries stored per grid point
+  double **cutsq;    // squared cutoffs, indexed [type][type]
+  double rcutfac;    // scale factor: rcutij = 2.0 * radelem[type] * rcutfac
+  double *radelem;    // indexed by atom type; enters rcutij
+  double *wjelem;    // indexed by atom type; copied to snaptr->wj
+  int *map;    // map types to [0,nelements)
+  int nelements, chemflag;    // chemflag != 0: pass mapped element index of each neighbor to SNA
+  int switchinnerflag;    // != 0: fill snaptr->sinnerij/dinnerij for each neighbor
+  double *sinnerelem;    // source of snaptr->sinnerij
+  double *dinnerelem;    // source of snaptr->dinnerij
+  class SNA *snaptr;    // SNA object used for compute_ui/compute_zi/compute_bi
+  double cutmax;    // presumably the largest cutoff over all types -- TODO confirm in constructor
+  int quadraticflag;    // != 0: also store quadratic products of blist values
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
--- a/src/ML-SNAP/compute_snad_atom.cpp
+++ b/src/ML-SNAP/compute_snad_atom.cpp
@ -34,20 +34,22 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
  Compute(lmp, narg, arg), cutsq(nullptr), list(nullptr), snad(nullptr),
  radelem(nullptr), wjelem(nullptr), sinnerelem(nullptr), dinnerelem(nullptr)
 {
+
+  // begin code common to all SNAP computes
+
  double rfac0, rmin0;
  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;

  int ntypes = atom->ntypes;
-  int nargmin = 6+2*ntypes;
+  int nargmin = 6 + 2 * ntypes;

-  if (narg < nargmin) error->all(FLERR,"Illegal compute snad/atom command");
+  if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style);

  // default values

  rmin0 = 0.0;
  switchflag = 1;
  bzeroflag = 1;
-  bnormflag = 0;
  quadraticflag = 0;
  chemflag = 0;
  bnormflag = 0;
@ -57,28 +59,32 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :

  // process required arguments

-  memory->create(radelem,ntypes+1,"snad/atom:radelem"); // offset by 1 to match up with types
-  memory->create(wjelem,ntypes+1,"snad/atom:wjelem");
-  rcutfac = atof(arg[3]);
-  rfac0 = atof(arg[4]);
-  twojmax = atoi(arg[5]);
+  memory->create(radelem, ntypes + 1, "sna/atom:radelem"); // offset by 1 to match up with types
+  memory->create(wjelem, ntypes + 1, "sna/atom:wjelem");
+
+  rcutfac = utils::numeric(FLERR, arg[3], false, lmp);
+  rfac0 = utils::numeric(FLERR, arg[4], false, lmp);
+  twojmax = utils::inumeric(FLERR, arg[5], false, lmp);
+
  for (int i = 0; i < ntypes; i++)
-    radelem[i+1] = atof(arg[6+i]);
+    radelem[i + 1] =
+        utils::numeric(FLERR, arg[6 + i], false, lmp);
  for (int i = 0; i < ntypes; i++)
-    wjelem[i+1] = atof(arg[6+ntypes+i]);
+    wjelem[i + 1] =
+        utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);

  // construct cutsq

  double cut;
  cutmax = 0.0;
-  memory->create(cutsq,ntypes+1,ntypes+1,"snad/atom:cutsq");
+  memory->create(cutsq, ntypes + 1, ntypes + 1, "sna/atom:cutsq");
  for (int i = 1; i <= ntypes; i++) {
-    cut = 2.0*radelem[i]*rcutfac;
+    cut = 2.0 * radelem[i] * rcutfac;
    if (cut > cutmax) cutmax = cut;
-    cutsq[i][i] = cut*cut;
-    for (int j = i+1; j <= ntypes; j++) {
-      cut = (radelem[i]+radelem[j])*rcutfac;
-      cutsq[i][j] = cutsq[j][i] = cut*cut;
+    cutsq[i][i] = cut * cut;
+    for (int j = i + 1; j <= ntypes; j++) {
+      cut = (radelem[i] + radelem[j]) * rcutfac;
+      cutsq[i][j] = cutsq[j][i] = cut * cut;
    }
  }

@ -92,93 +98,89 @@ ComputeSNADAtom::ComputeSNADAtom(LAMMPS *lmp, int narg, char **arg) :
  int iarg = nargmin;

  while (iarg < narg) {
-    if (strcmp(arg[iarg],"rmin0") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      rmin0 = atof(arg[iarg+1]);
+    if (strcmp(arg[iarg], "rmin0") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      rmin0 = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"bzeroflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      bzeroflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "switchflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"switchflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      switchflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "bzeroflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bzeroflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"quadraticflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      quadraticflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "quadraticflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"chem") == 0) {
-      if (iarg+2+ntypes > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
+    } else if (strcmp(arg[iarg], "chem") == 0) {    // needs nelements + ntypes element ids
+      if (iarg + 2 + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
      chemflag = 1;
-      memory->create(map,ntypes+1,"compute_snad_atom:map");
-      nelements = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
+      memory->create(map, ntypes + 1, "compute_sna_grid:map");
+      nelements = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      for (int i = 0; i < ntypes; i++) {
-        int jelem = utils::inumeric(FLERR,arg[iarg+2+i],false,lmp);
-        if (jelem < 0 || jelem >= nelements)
-          error->all(FLERR,"Illegal compute snad/atom command");
-        map[i+1] = jelem;
+        int jelem = utils::inumeric(FLERR, arg[iarg + 2 + i], false, lmp);
+        if (jelem < 0 || jelem >= nelements) error->all(FLERR, "Illegal compute {} command", style);
+        map[i + 1] = jelem;
      }
-      iarg += 2+ntypes;
-    } else if (strcmp(arg[iarg],"bnormflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      bnormflag = atoi(arg[iarg+1]);
+      iarg += 2 + ntypes;
+    } else if (strcmp(arg[iarg], "bnormflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bnormflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"wselfallflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      wselfallflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "wselfallflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      wselfallflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"switchinnerflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      switchinnerflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "switchinnerflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchinnerflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"sinner") == 0) {
+    } else if (strcmp(arg[iarg], "sinner") == 0) {
      iarg++;
-      if (iarg+ntypes > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      memory->create(sinnerelem,ntypes+1,"snad/atom:sinnerelem");
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(sinnerelem, ntypes + 1, "snap:sinnerelem");
      for (int i = 0; i < ntypes; i++)
-        sinnerelem[i+1] = utils::numeric(FLERR,arg[iarg+i],false,lmp);
+        sinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
      sinnerflag = 1;
      iarg += ntypes;
-    } else if (strcmp(arg[iarg],"dinner") == 0) {
+    } else if (strcmp(arg[iarg], "dinner") == 0) {
      iarg++;
-      if (iarg+ntypes > narg)
-        error->all(FLERR,"Illegal compute snad/atom command");
-      memory->create(dinnerelem,ntypes+1,"snad/atom:dinnerelem");
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(dinnerelem, ntypes + 1, "snap:dinnerelem");
      for (int i = 0; i < ntypes; i++)
-        dinnerelem[i+1] = utils::numeric(FLERR,arg[iarg+i],false,lmp);
+        dinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
      dinnerflag = 1;
      iarg += ntypes;
-    } else error->all(FLERR,"Illegal compute snad/atom command");
+    } else
+      error->all(FLERR, "Illegal compute {} command", style);
  }

  if (switchinnerflag && !(sinnerflag && dinnerflag))
-    error->all(FLERR,"Illegal compute snad/atom command: switchinnerflag = 1, missing sinner/dinner keyword");
+    error->all(
+        FLERR,
+        "Illegal compute {} command: switchinnerflag = 1, missing sinner/dinner keyword",
+        style);

  if (!switchinnerflag && (sinnerflag || dinnerflag))
-    error->all(FLERR,"Illegal compute snad/atom command: switchinnerflag = 0, unexpected sinner/dinner keyword");
+    error->all(
+        FLERR,
+        "Illegal compute {} command: switchinnerflag = 0, unexpected sinner/dinner keyword",
+        style);

-
-  snaptr = new SNA(lmp, rfac0, twojmax,
-                   rmin0, switchflag, bzeroflag,
-                   chemflag, bnormflag, wselfallflag,
-                   nelements, switchinnerflag);
+  snaptr = new SNA(lmp, rfac0, twojmax, rmin0, switchflag, bzeroflag, chemflag, bnormflag,
+                   wselfallflag, nelements, switchinnerflag);

  ncoeff = snaptr->ncoeff;
-  nperdim = ncoeff;
-  if (quadraticflag) nperdim += (ncoeff*(ncoeff+1))/2;
-  yoffset = nperdim;
-  zoffset = 2*nperdim;
-  size_peratom_cols = 3*nperdim*atom->ntypes;
+  nvalues = ncoeff;
+  if (quadraticflag) nvalues += (ncoeff * (ncoeff + 1)) / 2;
+
+  // end code common to all SNAP computes
+
+  yoffset = nvalues;
+  zoffset = 2*nvalues;
+  size_peratom_cols = 3*nvalues*atom->ntypes;
  comm_reverse = size_peratom_cols;
  peratom_flag = 1;

@ -289,7 +291,7 @@ void ComputeSNADAtom::compute_peratom()
      // const int typeoffset = threencoeff*(atom->type[i]-1);
      // const int quadraticoffset = threencoeff*atom->ntypes +
      //   threencoeffq*(atom->type[i]-1);
-      const int typeoffset = 3*nperdim*(atom->type[i]-1);
+      const int typeoffset = 3*nvalues*(atom->type[i]-1);

      // insure rij, inside, and typej  are of size jnum

--- a/src/ML-SNAP/compute_snad_atom.h
+++ b/src/ML-SNAP/compute_snad_atom.h
@ -37,7 +37,7 @@ class ComputeSNADAtom : public Compute {

 private:
  int nmax;
-  int ncoeff, nperdim, yoffset, zoffset;
+  int ncoeff, nvalues, yoffset, zoffset;
  double **cutsq;
  class NeighList *list;
  double **snad;
--- a/src/ML-SNAP/compute_snap.cpp
+++ b/src/ML-SNAP/compute_snap.cpp
@ -41,13 +41,15 @@ ComputeSnap::ComputeSnap(LAMMPS *lmp, int narg, char **arg) :
  array_flag = 1;
  extarray = 0;

+  // begin code common to all SNAP computes
+
  double rfac0, rmin0;
  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;

  int ntypes = atom->ntypes;
-  int nargmin = 6+2*ntypes;
+  int nargmin = 6 + 2 * ntypes;

-  if (narg < nargmin) error->all(FLERR,"Illegal compute snap command");
+  if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style);

  // default values

@ -64,28 +66,30 @@ ComputeSnap::ComputeSnap(LAMMPS *lmp, int narg, char **arg) :

  // process required arguments

-  memory->create(radelem,ntypes+1,"snap:radelem"); // offset by 1 to match up with types
-  memory->create(wjelem,ntypes+1,"snap:wjelem");
-  rcutfac = atof(arg[3]);
-  rfac0 = atof(arg[4]);
-  twojmax = atoi(arg[5]);
+  memory->create(radelem, ntypes + 1, "sna/atom:radelem"); // offset by 1 to match up with types
+  memory->create(wjelem, ntypes + 1, "sna/atom:wjelem");
+
+  rcutfac = utils::numeric(FLERR, arg[3], false, lmp);
+  rfac0 = utils::numeric(FLERR, arg[4], false, lmp);
+  twojmax = utils::inumeric(FLERR, arg[5], false, lmp);
+
  for (int i = 0; i < ntypes; i++)
-    radelem[i+1] = atof(arg[6+i]);
+    radelem[i + 1] = utils::numeric(FLERR, arg[6 + i], false, lmp);
  for (int i = 0; i < ntypes; i++)
-    wjelem[i+1] = atof(arg[6+ntypes+i]);
+    wjelem[i + 1] = utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);

  // construct cutsq

  double cut;
  cutmax = 0.0;
-  memory->create(cutsq,ntypes+1,ntypes+1,"snap:cutsq");
+  memory->create(cutsq, ntypes + 1, ntypes + 1, "sna/atom:cutsq");
  for (int i = 1; i <= ntypes; i++) {
-    cut = 2.0*radelem[i]*rcutfac;
+    cut = 2.0 * radelem[i] * rcutfac;
    if (cut > cutmax) cutmax = cut;
-    cutsq[i][i] = cut*cut;
-    for (int j = i+1; j <= ntypes; j++) {
-      cut = (radelem[i]+radelem[j])*rcutfac;
-      cutsq[i][j] = cutsq[j][i] = cut*cut;
+    cutsq[i][i] = cut * cut;
+    for (int j = i + 1; j <= ntypes; j++) {
+      cut = (radelem[i] + radelem[j]) * rcutfac;
+      cutsq[i][j] = cutsq[j][i] = cut * cut;
    }
  }

@ -99,107 +103,103 @@ ComputeSnap::ComputeSnap(LAMMPS *lmp, int narg, char **arg) :
  int iarg = nargmin;

  while (iarg < narg) {
-    if (strcmp(arg[iarg],"rmin0") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snap command");
-      rmin0 = atof(arg[iarg+1]);
+    if (strcmp(arg[iarg], "rmin0") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      rmin0 = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"bzeroflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snap command");
-      bzeroflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "switchflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"switchflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snap command");
-      switchflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "bzeroflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bzeroflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"quadraticflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snap command");
-      quadraticflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "quadraticflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"chem") == 0) {
-      if (iarg+2+ntypes > narg)
-        error->all(FLERR,"Illegal compute snap command");
+    } else if (strcmp(arg[iarg], "chem") == 0) {    // needs nelements + ntypes element ids
+      if (iarg + 2 + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
      chemflag = 1;
-      memory->create(map,ntypes+1,"compute_snap:map");
-      nelements = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
+      memory->create(map, ntypes + 1, "compute_sna_grid:map");
+      nelements = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      for (int i = 0; i < ntypes; i++) {
-        int jelem = utils::inumeric(FLERR,arg[iarg+2+i],false,lmp);
-        if (jelem < 0 || jelem >= nelements)
-          error->all(FLERR,"Illegal compute snap command");
-        map[i+1] = jelem;
+        int jelem = utils::inumeric(FLERR, arg[iarg + 2 + i], false, lmp);
+        if (jelem < 0 || jelem >= nelements) error->all(FLERR, "Illegal compute {} command", style);
+        map[i + 1] = jelem;
      }
-      iarg += 2+ntypes;
-    } else if (strcmp(arg[iarg],"bnormflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snap command");
-      bnormflag = atoi(arg[iarg+1]);
+      iarg += 2 + ntypes;
+    } else if (strcmp(arg[iarg], "bnormflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bnormflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"wselfallflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snap command");
-      wselfallflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "wselfallflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      wselfallflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"bikflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snap command");
-      bikflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "bikflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bikflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"switchinnerflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snap command");
-      switchinnerflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "switchinnerflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchinnerflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"sinner") == 0) {
+    } else if (strcmp(arg[iarg], "sinner") == 0) {
      iarg++;
-      if (iarg+ntypes > narg)
-        error->all(FLERR,"Illegal compute snap command");
-      memory->create(sinnerelem,ntypes+1,"snap:sinnerelem");
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(sinnerelem, ntypes + 1, "snap:sinnerelem");
      for (int i = 0; i < ntypes; i++)
-        sinnerelem[i+1] = utils::numeric(FLERR,arg[iarg+i],false,lmp);
+        sinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
      sinnerflag = 1;
      iarg += ntypes;
-    } else if (strcmp(arg[iarg],"dinner") == 0) {
+    } else if (strcmp(arg[iarg], "dinner") == 0) {
      iarg++;
-      if (iarg+ntypes > narg)
-        error->all(FLERR,"Illegal compute snap command");
-      memory->create(dinnerelem,ntypes+1,"snap:dinnerelem");
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(dinnerelem, ntypes + 1, "snap:dinnerelem");
      for (int i = 0; i < ntypes; i++)
-        dinnerelem[i+1] = utils::numeric(FLERR,arg[iarg+i],false,lmp);
+        dinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
      dinnerflag = 1;
      iarg += ntypes;
-    } else error->all(FLERR,"Illegal compute snap command");
+    } else
+      error->all(FLERR, "Illegal compute {} command", style);
  }

  if (switchinnerflag && !(sinnerflag && dinnerflag))
-    error->all(FLERR,"Illegal compute snap command: switchinnerflag = 1, missing sinner/dinner keyword");
+    error->all(
+        FLERR,
+        "Illegal compute {} command: switchinnerflag = 1, missing sinner/dinner keyword",
+        style);

  if (!switchinnerflag && (sinnerflag || dinnerflag))
-    error->all(FLERR,"Illegal compute snap command: switchinnerflag = 0, unexpected sinner/dinner keyword");
+    error->all(
+        FLERR,
+        "Illegal compute {} command: switchinnerflag = 0, unexpected sinner/dinner keyword",
+        style);

-  snaptr = new SNA(lmp, rfac0, twojmax,
-                   rmin0, switchflag, bzeroflag,
-                   chemflag, bnormflag, wselfallflag,
-                   nelements, switchinnerflag);
+  snaptr = new SNA(lmp, rfac0, twojmax, rmin0, switchflag, bzeroflag, chemflag, bnormflag,
+                   wselfallflag, nelements, switchinnerflag);

  ncoeff = snaptr->ncoeff;
-  nperdim = ncoeff;
-  if (quadraticflag) nperdim += (ncoeff*(ncoeff+1))/2;
+  nvalues = ncoeff;
+  if (quadraticflag) nvalues += (ncoeff * (ncoeff + 1)) / 2;
+
+  // end code common to all SNAP computes
+
  ndims_force = 3;
  ndims_virial = 6;
-  yoffset = nperdim;
-  zoffset = 2*nperdim;
+  yoffset = nvalues;
+  zoffset = 2*nvalues;
  natoms = atom->natoms;
  bik_rows = 1;
  if (bikflag) bik_rows = natoms;
  size_array_rows = bik_rows+ndims_force*natoms+ndims_virial;
-  size_array_cols = nperdim*atom->ntypes+1;
+  size_array_cols = nvalues*atom->ntypes+1;
  lastcol = size_array_cols-1;

  ndims_peratom = ndims_force;
-  size_peratom = ndims_peratom*nperdim*atom->ntypes;
+  size_peratom = ndims_peratom*nvalues*atom->ntypes;

  nmax = 0;
 }
@ -341,8 +341,8 @@ void ComputeSnap::compute_array()
      const double radi = radelem[itype];
      const int* const jlist = firstneigh[i];
      const int jnum = numneigh[i];
-      const int typeoffset_local = ndims_peratom*nperdim*(itype-1);
-      const int typeoffset_global = nperdim*(itype-1);
+      const int typeoffset_local = ndims_peratom*nvalues*(itype-1);
+      const int typeoffset_global = nvalues*(itype-1);

      // insure rij, inside, and typej  are of size jnum

@ -481,9 +481,9 @@ void ComputeSnap::compute_array()
  // accumulate bispectrum force contributions to global array

  for (int itype = 0; itype < atom->ntypes; itype++) {
-    const int typeoffset_local = ndims_peratom*nperdim*itype;
-    const int typeoffset_global = nperdim*itype;
-    for (int icoeff = 0; icoeff < nperdim; icoeff++) {
+    const int typeoffset_local = ndims_peratom*nvalues*itype;
+    const int typeoffset_global = nvalues*itype;
+    for (int icoeff = 0; icoeff < nvalues; icoeff++) {
      for (int i = 0; i < ntotal; i++) {
        double *snadi = snap_peratom[i]+typeoffset_local;
        int iglobal = atom->tag[i];
@ -549,10 +549,10 @@ void ComputeSnap::dbdotr_compute()
  int nall = atom->nlocal + atom->nghost;
  for (int i = 0; i < nall; i++)
    for (int itype = 0; itype < atom->ntypes; itype++) {
-      const int typeoffset_local = ndims_peratom*nperdim*itype;
-      const int typeoffset_global = nperdim*itype;
+      const int typeoffset_local = ndims_peratom*nvalues*itype;
+      const int typeoffset_global = nvalues*itype;
      double *snadi = snap_peratom[i]+typeoffset_local;
-      for (int icoeff = 0; icoeff < nperdim; icoeff++) {
+      for (int icoeff = 0; icoeff < nvalues; icoeff++) {
        double dbdx = snadi[icoeff];
        double dbdy = snadi[icoeff+yoffset];
        double dbdz = snadi[icoeff+zoffset];
--- a/src/ML-SNAP/compute_snap.h
+++ b/src/ML-SNAP/compute_snap.h
@ -35,7 +35,7 @@ class ComputeSnap : public Compute {

 private:
  int natoms, nmax, size_peratom, lastcol;
-  int ncoeff, nperdim, yoffset, zoffset;
+  int ncoeff, nvalues, yoffset, zoffset;
  int ndims_peratom, ndims_force, ndims_virial;
  double **cutsq;
  class NeighList *list;
--- a/src/ML-SNAP/compute_snav_atom.cpp
+++ b/src/ML-SNAP/compute_snav_atom.cpp
@ -21,6 +21,7 @@
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "force.h"
+#include "pair.h"
 #include "comm.h"
 #include "memory.h"
 #include "error.h"
@ -33,20 +34,22 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
  Compute(lmp, narg, arg), cutsq(nullptr), list(nullptr), snav(nullptr),
  radelem(nullptr), wjelem(nullptr), sinnerelem(nullptr), dinnerelem(nullptr)
 {
+
+  // begin code common to all SNAP computes
+
  double rfac0, rmin0;
  int twojmax, switchflag, bzeroflag, bnormflag, wselfallflag;

  int ntypes = atom->ntypes;
-  int nargmin = 6+2*ntypes;
+  int nargmin = 6 + 2 * ntypes;

-  if (narg < nargmin) error->all(FLERR,"Illegal compute snav/atom command");
+  if (narg < nargmin) error->all(FLERR, "Illegal compute {} command", style);

  // default values

  rmin0 = 0.0;
  switchflag = 1;
  bzeroflag = 1;
-  bnormflag = 0;
  quadraticflag = 0;
  chemflag = 0;
  bnormflag = 0;
@ -56,24 +59,32 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :

  // process required arguments

-  memory->create(radelem,ntypes+1,"snav/atom:radelem"); // offset by 1 to match up with types
-  memory->create(wjelem,ntypes+1,"snav/atom:wjelem");
-  rcutfac = atof(arg[3]);
-  rfac0 = atof(arg[4]);
-  twojmax = atoi(arg[5]);
+  memory->create(radelem, ntypes + 1, "sna/atom:radelem"); // offset by 1 to match up with types
+  memory->create(wjelem, ntypes + 1, "sna/atom:wjelem");
+
+  rcutfac = utils::numeric(FLERR, arg[3], false, lmp);
+  rfac0 = utils::numeric(FLERR, arg[4], false, lmp);
+  twojmax = utils::inumeric(FLERR, arg[5], false, lmp);
+
  for (int i = 0; i < ntypes; i++)
-    radelem[i+1] = atof(arg[6+i]);
+    radelem[i + 1] =
+        utils::numeric(FLERR, arg[6 + i], false, lmp);
  for (int i = 0; i < ntypes; i++)
-    wjelem[i+1] = atof(arg[6+ntypes+i]);
+    wjelem[i + 1] =
+        utils::numeric(FLERR, arg[6 + ntypes + i], false, lmp);
+
  // construct cutsq
+
  double cut;
-  memory->create(cutsq,ntypes+1,ntypes+1,"snav/atom:cutsq");
+  cutmax = 0.0;
+  memory->create(cutsq, ntypes + 1, ntypes + 1, "sna/atom:cutsq");
  for (int i = 1; i <= ntypes; i++) {
-    cut = 2.0*radelem[i]*rcutfac;
-    cutsq[i][i] = cut*cut;
-    for (int j = i+1; j <= ntypes; j++) {
-      cut = (radelem[i]+radelem[j])*rcutfac;
-      cutsq[i][j] = cutsq[j][i] = cut*cut;
+    cut = 2.0 * radelem[i] * rcutfac;
+    if (cut > cutmax) cutmax = cut;
+    cutsq[i][i] = cut * cut;
+    for (int j = i + 1; j <= ntypes; j++) {
+      cut = (radelem[i] + radelem[j]) * rcutfac;
+      cutsq[i][j] = cutsq[j][i] = cut * cut;
    }
  }

@ -87,90 +98,87 @@ ComputeSNAVAtom::ComputeSNAVAtom(LAMMPS *lmp, int narg, char **arg) :
  int iarg = nargmin;

  while (iarg < narg) {
-    if (strcmp(arg[iarg],"rmin0") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      rmin0 = atof(arg[iarg+1]);
+    if (strcmp(arg[iarg], "rmin0") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      rmin0 = utils::numeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"switchflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      switchflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "switchflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"bzeroflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      bzeroflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "bzeroflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bzeroflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"quadraticflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      quadraticflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "quadraticflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      quadraticflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"chem") == 0) {
-      if (iarg+2+ntypes > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
+    } else if (strcmp(arg[iarg], "chem") == 0) {    // needs nelements + ntypes element ids
+      if (iarg + 2 + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
      chemflag = 1;
-      memory->create(map,ntypes+1,"compute_sna_atom:map");
-      nelements = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
+      memory->create(map, ntypes + 1, "compute_sna_grid:map");
+      nelements = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      for (int i = 0; i < ntypes; i++) {
-        int jelem = utils::inumeric(FLERR,arg[iarg+2+i],false,lmp);
-        if (jelem < 0 || jelem >= nelements)
-          error->all(FLERR,"Illegal compute snav/atom command");
-        map[i+1] = jelem;
+        int jelem = utils::inumeric(FLERR, arg[iarg + 2 + i], false, lmp);
+        if (jelem < 0 || jelem >= nelements) error->all(FLERR, "Illegal compute {} command", style);
+        map[i + 1] = jelem;
      }
-      iarg += 2+ntypes;
-    } else if (strcmp(arg[iarg],"bnormflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      bnormflag = atoi(arg[iarg+1]);
+      iarg += 2 + ntypes;
+    } else if (strcmp(arg[iarg], "bnormflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      bnormflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"wselfallflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      wselfallflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "wselfallflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      wselfallflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"switchinnerflag") == 0) {
-      if (iarg+2 > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      switchinnerflag = atoi(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "switchinnerflag") == 0) {
+      if (iarg + 2 > narg) error->all(FLERR, "Illegal compute {} command", style);
+      switchinnerflag = utils::inumeric(FLERR, arg[iarg + 1], false, lmp);
      iarg += 2;
-    } else if (strcmp(arg[iarg],"sinner") == 0) {
+    } else if (strcmp(arg[iarg], "sinner") == 0) {
      iarg++;
-      if (iarg+ntypes > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      memory->create(sinnerelem,ntypes+1,"snav/atom:sinnerelem");
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(sinnerelem, ntypes + 1, "snap:sinnerelem");
      for (int i = 0; i < ntypes; i++)
-        sinnerelem[i+1] = utils::numeric(FLERR,arg[iarg+i],false,lmp);
+        sinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
      sinnerflag = 1;
      iarg += ntypes;
-    } else if (strcmp(arg[iarg],"dinner") == 0) {
+    } else if (strcmp(arg[iarg], "dinner") == 0) {
      iarg++;
-      if (iarg+ntypes > narg)
-        error->all(FLERR,"Illegal compute snav/atom command");
-      memory->create(dinnerelem,ntypes+1,"snav/atom:dinnerelem");
+      if (iarg + ntypes > narg) error->all(FLERR, "Illegal compute {} command", style);
+      memory->create(dinnerelem, ntypes + 1, "snap:dinnerelem");
      for (int i = 0; i < ntypes; i++)
-        dinnerelem[i+1] = utils::numeric(FLERR,arg[iarg+i],false,lmp);
+        dinnerelem[i + 1] = utils::numeric(FLERR, arg[iarg + i], false, lmp);
      dinnerflag = 1;
      iarg += ntypes;
-    } else error->all(FLERR,"Illegal compute snav/atom command");
+    } else
+      error->all(FLERR, "Illegal compute {} command", style);
  }

  if (switchinnerflag && !(sinnerflag && dinnerflag))
-    error->all(FLERR,"Illegal compute snav/atom command: switchinnerflag = 1, missing sinner/dinner keyword");
+    error->all(
+        FLERR,
+        "Illegal compute {} command: switchinnerflag = 1, missing sinner/dinner keyword",
+        style);

  if (!switchinnerflag && (sinnerflag || dinnerflag))
-    error->all(FLERR,"Illegal compute snav/atom command: switchinnerflag = 0, unexpected sinner/dinner keyword");
+    error->all(
+        FLERR,
+        "Illegal compute {} command: switchinnerflag = 0, unexpected sinner/dinner keyword",
+        style);

-  snaptr = new SNA(lmp, rfac0, twojmax,
-                   rmin0, switchflag, bzeroflag,
-                   chemflag, bnormflag, wselfallflag,
-                   nelements, switchinnerflag);
+  snaptr = new SNA(lmp, rfac0, twojmax, rmin0, switchflag, bzeroflag, chemflag, bnormflag,
+                   wselfallflag, nelements, switchinnerflag);

  ncoeff = snaptr->ncoeff;
-  nperdim = ncoeff;
-  if (quadraticflag) nperdim += (ncoeff*(ncoeff+1))/2;
-  size_peratom_cols = 6*nperdim*atom->ntypes;
+  nvalues = ncoeff;
+  if (quadraticflag) nvalues += (ncoeff * (ncoeff + 1)) / 2;
+
+  // end code common to all SNAP computes
+
+  size_peratom_cols = 6*nvalues*atom->ntypes;
  comm_reverse = size_peratom_cols;
  peratom_flag = 1;

@ -203,10 +211,9 @@ void ComputeSNAVAtom::init()
 {
  if (force->pair == nullptr)
    error->all(FLERR,"Compute snav/atom requires a pair style be defined");
-   // TODO: Not sure what to do with this error check since cutoff radius is not
-  // a single number
- //if (sqrt(cutsq) > force->pair->cutforce)
-   // error->all(FLERR,"Compute snav/atom cutoff is longer than pairwise cutoff");
+
+  if (cutmax > force->pair->cutforce)
+    error->all(FLERR,"Compute snav/atom cutoff is longer than pairwise cutoff");

  // need an occasional full neighbor list

@ -280,7 +287,7 @@ void ComputeSNAVAtom::compute_peratom()
      const int* const jlist = firstneigh[i];
      const int jnum = numneigh[i];

-      const int typeoffset = 6*nperdim*(atom->type[i]-1);
+      const int typeoffset = 6*nvalues*(atom->type[i]-1);

      // insure rij, inside, and typej  are of size jnum

@ -339,17 +346,17 @@ void ComputeSNAVAtom::compute_peratom()

        for (int icoeff = 0; icoeff < ncoeff; icoeff++) {
          snavi[icoeff]           += snaptr->dblist[icoeff][0]*xtmp;
-          snavi[icoeff+nperdim]   += snaptr->dblist[icoeff][1]*ytmp;
-          snavi[icoeff+2*nperdim] += snaptr->dblist[icoeff][2]*ztmp;
-          snavi[icoeff+3*nperdim] += snaptr->dblist[icoeff][1]*ztmp;
-          snavi[icoeff+4*nperdim] += snaptr->dblist[icoeff][0]*ztmp;
-          snavi[icoeff+5*nperdim] += snaptr->dblist[icoeff][0]*ytmp;
+          snavi[icoeff+nvalues]   += snaptr->dblist[icoeff][1]*ytmp;
+          snavi[icoeff+2*nvalues] += snaptr->dblist[icoeff][2]*ztmp;
+          snavi[icoeff+3*nvalues] += snaptr->dblist[icoeff][1]*ztmp;
+          snavi[icoeff+4*nvalues] += snaptr->dblist[icoeff][0]*ztmp;
+          snavi[icoeff+5*nvalues] += snaptr->dblist[icoeff][0]*ytmp;
          snavj[icoeff]           -= snaptr->dblist[icoeff][0]*x[j][0];
-          snavj[icoeff+nperdim]   -= snaptr->dblist[icoeff][1]*x[j][1];
-          snavj[icoeff+2*nperdim] -= snaptr->dblist[icoeff][2]*x[j][2];
-          snavj[icoeff+3*nperdim] -= snaptr->dblist[icoeff][1]*x[j][2];
-          snavj[icoeff+4*nperdim] -= snaptr->dblist[icoeff][0]*x[j][2];
-          snavj[icoeff+5*nperdim] -= snaptr->dblist[icoeff][0]*x[j][1];
+          snavj[icoeff+nvalues]   -= snaptr->dblist[icoeff][1]*x[j][1];
+          snavj[icoeff+2*nvalues] -= snaptr->dblist[icoeff][2]*x[j][2];
+          snavj[icoeff+3*nvalues] -= snaptr->dblist[icoeff][1]*x[j][2];
+          snavj[icoeff+4*nvalues] -= snaptr->dblist[icoeff][0]*x[j][2];
+          snavj[icoeff+5*nvalues] -= snaptr->dblist[icoeff][0]*x[j][1];
        }

        if (quadraticflag) {
@ -369,17 +376,17 @@ void ComputeSNAVAtom::compute_peratom()
            double dbytmp = bi*biy;
            double dbztmp = bi*biz;
            snavi[ncount] +=           dbxtmp*xtmp;
-            snavi[ncount+nperdim] +=   dbytmp*ytmp;
-            snavi[ncount+2*nperdim] += dbztmp*ztmp;
-            snavi[ncount+3*nperdim] += dbytmp*ztmp;
-            snavi[ncount+4*nperdim] += dbxtmp*ztmp;
-            snavi[ncount+5*nperdim] += dbxtmp*ytmp;
+            snavi[ncount+nvalues] +=   dbytmp*ytmp;
+            snavi[ncount+2*nvalues] += dbztmp*ztmp;
+            snavi[ncount+3*nvalues] += dbytmp*ztmp;
+            snavi[ncount+4*nvalues] += dbxtmp*ztmp;
+            snavi[ncount+5*nvalues] += dbxtmp*ytmp;
            snavj[ncount] -=            dbxtmp*x[j][0];
-            snavj[ncount+nperdim] -=    dbytmp*x[j][1];
-            snavj[ncount+2*nperdim] -=  dbztmp*x[j][2];
-            snavj[ncount+3*nperdim] -=  dbytmp*x[j][2];
-            snavj[ncount+4*nperdim] -=  dbxtmp*x[j][2];
-            snavj[ncount+5*nperdim] -=  dbxtmp*x[j][1];
+            snavj[ncount+nvalues] -=    dbytmp*x[j][1];
+            snavj[ncount+2*nvalues] -=  dbztmp*x[j][2];
+            snavj[ncount+3*nvalues] -=  dbytmp*x[j][2];
+            snavj[ncount+4*nvalues] -=  dbxtmp*x[j][2];
+            snavj[ncount+5*nvalues] -=  dbxtmp*x[j][1];
            ncount++;

            // upper-triangular elements of quadratic matrix
@ -392,17 +399,17 @@ void ComputeSNAVAtom::compute_peratom()
              double dbztmp = bi*snaptr->dblist[jcoeff][2]
                + biz*snaptr->blist[jcoeff];
              snavi[ncount] +=           dbxtmp*xtmp;
-              snavi[ncount+nperdim] +=   dbytmp*ytmp;
-              snavi[ncount+2*nperdim] += dbztmp*ztmp;
-              snavi[ncount+3*nperdim] += dbytmp*ztmp;
-              snavi[ncount+4*nperdim] += dbxtmp*ztmp;
-              snavi[ncount+5*nperdim] += dbxtmp*ytmp;
+              snavi[ncount+nvalues] +=   dbytmp*ytmp;
+              snavi[ncount+2*nvalues] += dbztmp*ztmp;
+              snavi[ncount+3*nvalues] += dbytmp*ztmp;
+              snavi[ncount+4*nvalues] += dbxtmp*ztmp;
+              snavi[ncount+5*nvalues] += dbxtmp*ytmp;
              snavj[ncount] -=           dbxtmp*x[j][0];
-              snavj[ncount+nperdim] -=   dbytmp*x[j][1];
-              snavj[ncount+2*nperdim] -= dbztmp*x[j][2];
-              snavj[ncount+3*nperdim] -= dbytmp*x[j][2];
-              snavj[ncount+4*nperdim] -= dbxtmp*x[j][2];
-              snavj[ncount+5*nperdim] -= dbxtmp*x[j][1];
+              snavj[ncount+nvalues] -=   dbytmp*x[j][1];
+              snavj[ncount+2*nvalues] -= dbztmp*x[j][2];
+              snavj[ncount+3*nvalues] -= dbytmp*x[j][2];
+              snavj[ncount+4*nvalues] -= dbxtmp*x[j][2];
+              snavj[ncount+5*nvalues] -= dbxtmp*x[j][1];
              ncount++;
            }
          }
--- a/src/ML-SNAP/compute_snav_atom.h
+++ b/src/ML-SNAP/compute_snav_atom.h
@ -37,7 +37,7 @@ class ComputeSNAVAtom : public Compute {

 private:
  int nmax;
-  int ncoeff, nperdim;
+  int ncoeff, nvalues;
  double **cutsq;
  class NeighList *list;
  double **snav;
@ -50,6 +50,7 @@ class ComputeSNAVAtom : public Compute {
  double *sinnerelem;
  double *dinnerelem;
  class SNA *snaptr;
+  double cutmax;
  int quadraticflag;
 };

--- a/src/ML-SNAP/pair_snap.cpp
+++ b/src/ML-SNAP/pair_snap.cpp
@ -63,11 +63,8 @@ PairSNAP::~PairSNAP()
  memory->destroy(radelem);
  memory->destroy(wjelem);
  memory->destroy(coeffelem);
-
-  if (switchinnerflag) {
-    memory->destroy(sinnerelem);
-    memory->destroy(dinnerelem);
-  }
+  memory->destroy(sinnerelem);
+  memory->destroy(dinnerelem);

  memory->destroy(beta);
  memory->destroy(bispectrum);
--- a/src/dump.cpp
+++ b/src/dump.cpp
@ -332,6 +332,7 @@ void Dump::write()
  // if file per timestep, open new file

  if (multifile) openfile();
+  if (fp) clearerr(fp);

  // simulation box bounds

@ -519,6 +520,8 @@ void Dump::write()

  if (filewriter && fp != nullptr) write_footer();

+  if (fp && ferror(fp)) error->one(FLERR,"Error writing dump {}: {}", id, utils::getsyserror());
+
  // if file per timestep, close file if I am filewriter

  if (multifile) {
--- a/src/lammps.cpp
+++ b/src/lammps.cpp
@ -129,9 +129,8 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator) :
  version = (const char *) LAMMPS_VERSION;
  num_ver = utils::date2num(version);

-  clientserver = 0;
-  cslib = nullptr;
-  cscomm = 0;
+  external_comm = 0;
+  mdicomm = nullptr;

  skiprunflag = 0;

@ -155,19 +154,16 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator) :
 #endif

  // check if -mpicolor is first arg
-  // if so, then 2 apps were launched with one mpirun command
+  // if so, then 2 or more apps were launched with one mpirun command
  //   this means passed communicator (e.g. MPI_COMM_WORLD) is bigger than LAMMPS
-  //     e.g. for client/server coupling with another code
-  //     in the future LAMMPS might leverage this in other ways
  //   universe communicator needs to shrink to be just LAMMPS
  // syntax: -mpicolor color
-  //   color = integer for this app, different than other app(s)
+  //   color = integer for this app, different than any other app(s)
  // do the following:
  //   perform an MPI_Comm_split() to create a new LAMMPS-only subcomm
-  //   NOTE: this assumes other app(s) does same thing, else will hang!
+  //   NOTE: this assumes other app(s) make same call, else will hang!
  //   re-create universe with subcomm
-  //   store full multi-app comm in cscomm
-  //   cscomm is used by CSLIB package to exchange messages w/ other app
+  //   store comm that all apps belong to in external_comm

  int iarg = 1;
  if (narg-iarg >= 2 && (strcmp(arg[iarg],"-mpicolor") == 0 ||
@ -178,7 +174,7 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator) :
    int color = atoi(arg[iarg+1]);
    MPI_Comm subcomm;
    MPI_Comm_split(communicator,color,me,&subcomm);
-    cscomm = communicator;
+    external_comm = communicator;
    communicator = subcomm;
    delete universe;
    universe = new Universe(this,communicator);
@ -290,7 +286,7 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator) :
      logflag = iarg + 1;
      iarg += 2;

-    } else if (strcmp(arg[iarg],"-mpi") == 0 ||
+    } else if (strcmp(arg[iarg],"-mpicolor") == 0 ||
               strcmp(arg[iarg],"-m") == 0) {
      if (iarg+2 > narg)
        error->universe_all(FLERR,"Invalid command-line argument");
@ -766,13 +762,13 @@ LAMMPS::~LAMMPS()
  delete [] suffix2;
  delete [] suffixp;

-  // free the MPI comm created by -mpi command-line arg processed in constructor
+  // free the MPI comm created by -mpicolor cmdline arg processed in constructor
  // it was passed to universe as if original universe world
  // may have been split later by partitions, universe will free the splits
  // free a copy of uorig here, so check in universe destructor will still work

  MPI_Comm copy = universe->uorig;
-  if (cscomm) MPI_Comm_free(&copy);
+  if (external_comm) MPI_Comm_free(&copy);

  delete input;
  delete universe;
--- a/src/lammps.h
+++ b/src/lammps.h
@ -61,13 +61,15 @@ class LAMMPS {
  char *suffix, *suffix2, *suffixp;    // suffixes to add to input script style names
  int suffix_enable;                   // 1 if suffixes are enabled, 0 if disabled
  char *exename;                       // pointer to argv[0]
-                                       //
+
  char ***packargs;                    // arguments for cmdline package commands
  int num_package;                     // number of cmdline package commands
-                                       //
-  int clientserver;                    // 0 = neither, 1 = client, 2 = server
-  void *cslib;                         // client/server messaging via CSlib
-  MPI_Comm cscomm;                     // MPI comm for client+server in mpi/one mode
+
+  MPI_Comm external_comm;      // MPI comm encompassing external programs
+                               // when multiple programs launched by mpirun
+                               // set by -mpicolor command line arg
+
+  void *mdicomm;               // for use with MDI code coupling library

  const char *match_style(const char *style, const char *name);
  static const char *installed_packages[];
--- a/src/pair.cpp
+++ b/src/pair.cpp
@ -126,6 +126,7 @@ Pair::Pair(LAMMPS *lmp) : Pointers(lmp)
  datamask_modify = ALL_MASK;

  kokkosable = 0;
+  reverse_comm_device = 0;
  copymode = 0;
 }

--- a/src/pair.h
+++ b/src/pair.h
@ -123,6 +123,7 @@ class Pair : protected Pointers {
  ExecutionSpace execution_space;
  unsigned int datamask_read, datamask_modify;
  int kokkosable;    // 1 if Kokkos pair
+  int reverse_comm_device;    // 1 if reverse comm on Device

  Pair(class LAMMPS *);
  ~Pair() override;
--- a/src/variable.cpp
+++ b/src/variable.cpp
@ -4770,12 +4770,14 @@ double Variable::evaluate_boolean(char *str)
        }

        if (opprevious == NOT) {
-          if (flag2) error->all(FLERR,"Invalid Boolean syntax in if command");
+          if (flag2)
+            error->all(FLERR,"If command boolean not cannot operate on string");
          if (value2 == 0.0) argstack[nargstack].value = 1.0;
          else argstack[nargstack].value = 0.0;
+
        } else if (opprevious == EQ) {
          if (flag1 != flag2)
-            error->all(FLERR,"Invalid Boolean syntax in if command");
+            error->all(FLERR,"If command boolean is comparing string to number");
          if (flag2 == 0) {
            if (value1 == value2) argstack[nargstack].value = 1.0;
            else argstack[nargstack].value = 0.0;
@ -4787,7 +4789,7 @@ double Variable::evaluate_boolean(char *str)
          }
        } else if (opprevious == NE) {
          if (flag1 != flag2)
-            error->all(FLERR,"Invalid Boolean syntax in if command");
+            error->all(FLERR,"If command boolean is comparing string to number");
          if (flag2 == 0) {
            if (value1 != value2) argstack[nargstack].value = 1.0;
            else argstack[nargstack].value = 0.0;
@ -4797,32 +4799,41 @@ double Variable::evaluate_boolean(char *str)
            delete[] str1;
            delete[] str2;
          }
+
        } else if (opprevious == LT) {
-          if (flag2) error->all(FLERR,"Invalid Boolean syntax in if command");
+          if (flag1 || flag2)
+            error->all(FLERR,"If command boolean can only operate on numbers");
          if (value1 < value2) argstack[nargstack].value = 1.0;
          else argstack[nargstack].value = 0.0;
        } else if (opprevious == LE) {
-          if (flag2) error->all(FLERR,"Invalid Boolean syntax in if command");
+          if (flag1 || flag2)
+            error->all(FLERR,"If command boolean can only operate on numbers");
          if (value1 <= value2) argstack[nargstack].value = 1.0;
          else argstack[nargstack].value = 0.0;
        } else if (opprevious == GT) {
-          if (flag2) error->all(FLERR,"Invalid Boolean syntax in if command");
+          if (flag1 || flag2)
+            error->all(FLERR,"If command boolean can only operate on numbers");
          if (value1 > value2) argstack[nargstack].value = 1.0;
          else argstack[nargstack].value = 0.0;
        } else if (opprevious == GE) {
-          if (flag2) error->all(FLERR,"Invalid Boolean syntax in if command");
+          if (flag1 || flag2)
+            error->all(FLERR,"If command boolean can only operate on numbers");
          if (value1 >= value2) argstack[nargstack].value = 1.0;
          else argstack[nargstack].value = 0.0;
+
        } else if (opprevious == AND) {
-          if (flag2) error->all(FLERR,"Invalid Boolean syntax in if command");
+          if (flag1 || flag2)
+            error->all(FLERR,"If command boolean can only operate on numbers");
          if (value1 != 0.0 && value2 != 0.0) argstack[nargstack].value = 1.0;
          else argstack[nargstack].value = 0.0;
        } else if (opprevious == OR) {
-          if (flag2) error->all(FLERR,"Invalid Boolean syntax in if command");
+          if (flag1 || flag2)
+            error->all(FLERR,"If command boolean can only operate on numbers");
          if (value1 != 0.0 || value2 != 0.0) argstack[nargstack].value = 1.0;
          else argstack[nargstack].value = 0.0;
        } else if (opprevious == XOR) {
-          if (flag2) error->all(FLERR,"Invalid Boolean syntax in if command");
+          if (flag1 || flag2)
+            error->all(FLERR,"If command boolean can only operate on numbers");
          if ((value1 == 0.0 && value2 != 0.0) ||
              (value1 != 0.0 && value2 == 0.0))
            argstack[nargstack].value = 1.0;
@ -4845,6 +4856,13 @@ double Variable::evaluate_boolean(char *str)

  if (nopstack) error->all(FLERR,"Invalid Boolean syntax in if command");
  if (nargstack != 1) error->all(FLERR,"Invalid Boolean syntax in if command");
+
+  // if flag == 1, Boolean expression was a single string with no operator
+  // error b/c invalid, only single number with no operator is allowed
+
+  if (argstack[0].flag == 1)
+    error->all(FLERR,"If command boolean cannot be single string");
+
  return argstack[0].value;
 }