Merge pull request #3606 from bathmatt/kokkos-mliap-unified

Add MLIAP-KOKKOS version of the Unified model/descriptor
2023-01-26 13:57:42 -05:00
parent f7725242fa 49b639acd0
commit 30f459da92
13 changed files with 1108 additions and 73 deletions
--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@ -144,6 +144,7 @@ if(PKG_ML-IAP)
                                 ${KOKKOS_PKG_SOURCES_DIR}/mliap_descriptor_so3_kokkos.cpp
                                 ${KOKKOS_PKG_SOURCES_DIR}/mliap_model_linear_kokkos.cpp
                                 ${KOKKOS_PKG_SOURCES_DIR}/mliap_model_python_kokkos.cpp
+                                 ${KOKKOS_PKG_SOURCES_DIR}/mliap_unified_kokkos.cpp
                                 ${KOKKOS_PKG_SOURCES_DIR}/mliap_so3_kokkos.cpp)

  # Add KOKKOS version of ML-IAP Python coupling if non-KOKKOS version is included
--- a/python/lammps/mliap/init.py
+++ b/python/lammps/mliap/init.py
@ -32,7 +32,7 @@ if not pylib.Py_IsInitialized():
 else:
    from .loader import load_model, load_unified, activate_mliappy
    try:
-         from .loader import  load_model_kokkos,  activate_mliappy_kokkos
+         from .loader import  load_model_kokkos, load_unified_kokkos,  activate_mliappy_kokkos
    except Exception as ee:
        # ignore import error, it means that the KOKKOS package was not included in LAMMPS
        pass
--- a/python/lammps/mliap/loader.py
+++ b/python/lammps/mliap/loader.py
@ -75,7 +75,7 @@ def activate_mliappy(lmp):
 def activate_mliappy_kokkos(lmp):
    try:
        library = lmp.lib
-        module_names = ["mliap_model_python_couple_kokkos"]
+        module_names = ["mliap_model_python_couple_kokkos", "mliap_unified_couple_kokkos"]
        api_version = library.lammps_python_api_version()

        for module_name in module_names:
@ -118,3 +118,12 @@ def load_unified(model):
                          ) from ie
    mliap_unified_couple.load_from_python(model)

+def load_unified_kokkos(model):
+    try:
+        import mliap_unified_couple_kokkos
+    except ImportError as ie:
+        raise ImportError("ML-IAP python module must be activated before loading\n"
+                          "the pair style. Call lammps.mliap.activate_mliappy(lmp)."
+                          ) from ie
+    mliap_unified_couple_kokkos.load_from_python(model)
+
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@ -204,6 +204,8 @@ action mliap_model_linear_kokkos.h mliap_model_linear.h
 action mliap_model_python_kokkos.cpp mliap_model_linear.cpp
 action mliap_model_python_kokkos.h mliap_model_linear.h
 action mliap_model_kokkos.h mliap_model.h
+action mliap_unified_kokkos.cpp mliap_unified.cpp
+action mliap_unified_kokkos.h mliap_unified.h
 action mliap_so3_kokkos.cpp mliap_so3.cpp
 action mliap_so3_kokkos.h mliap_so3.h
 action modify_kokkos.cpp
@ -365,6 +367,7 @@ action verlet_kokkos.h

 # Install cython pyx file only if non-KOKKOS version is present
 action mliap_model_python_couple_kokkos.pyx mliap_model_python_couple.pyx
+action mliap_unified_couple_kokkos.pyx mliap_unified_couple.pyx

 # edit 2 Makefile.package files to include/exclude package info

@ -423,15 +426,19 @@ fi
 if (test $1 = 1) then
  if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then
    cythonize -3 ../mliap_model_python_couple_kokkos.pyx
+    cythonize -3 ../mliap_unified_couple_kokkos.pyx
  fi

 elif (test $1 = 0) then
  rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h
+  rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h

 elif (test $1 = 2) then
  if (type cythonize > /dev/null 2>&1 && test -e ../python_impl.cpp) then
    cythonize -3 ../mliap_model_python_couple_kokkos.pyx
+    cythonize -3 ../mliap_unified_couple_kokkos.pyx
  else
    rm -f ../mliap_model_python_couple_kokkos.cpp ../mliap_model_python_couple_kokkos.h
+    rm -f ../mliap_unified_couple_kokkos.cpp ../mliap_unified_couple_kokkos.h
  fi
 fi
--- a/src/KOKKOS/mliap_data_kokkos.cpp
+++ b/src/KOKKOS/mliap_data_kokkos.cpp
@ -56,7 +56,9 @@ MLIAPDataKokkos<DeviceType>::~MLIAPDataKokkos() {
  memoryKK->destroy_kokkos(k_ielems,ielems);
  memoryKK->destroy_kokkos(k_numneighs,numneighs);
  memoryKK->destroy_kokkos(k_jatoms,jatoms);
+  memoryKK->destroy_kokkos(k_pair_i,pair_i);
  memoryKK->destroy_kokkos(k_jelems,jelems);
+  memoryKK->destroy_kokkos(k_elems,elems);
  memoryKK->destroy_kokkos(k_ij);
  memoryKK->destroy_kokkos(k_rij,rij);
  memoryKK->destroy_kokkos(k_graddesc,graddesc);
@ -75,13 +77,17 @@ void MLIAPDataKokkos<DeviceType>::generate_neighdata(class NeighList *list_in, i
    nmax = atom->nmax;
    memoryKK->destroy_kokkos(k_gradforce,gradforce);
    memoryKK->create_kokkos(k_gradforce, gradforce, nmax, size_gradforce, "mliap_data:gradforce");
-  }
+    memoryKK->destroy_kokkos(k_elems,elems);
+    memoryKK->create_kokkos(k_elems, elems, nmax, "mliap_data:elems");  }

  // clear gradforce array

+  int nall = atom->nlocal + atom->nghost;
+  ntotal = nall;
  auto d_gradforce = k_gradforce.template view<DeviceType>();
  Kokkos::deep_copy(d_gradforce, 0.);
-
+  auto d_elems = k_elems.template view<DeviceType>();
+  Kokkos::deep_copy(d_elems, 0.);
  // grow arrays if necessary

  nlistatoms = list->inum;
@ -122,6 +128,7 @@ void MLIAPDataKokkos<DeviceType>::generate_neighdata(class NeighList *list_in, i
  auto d_ij = k_ij.template view<DeviceType>();
  auto d_numneighs = k_numneighs.template view<DeviceType>();
  auto d_jatoms = k_jatoms.template view<DeviceType>();
+  auto d_pair_i= k_pair_i.template view<DeviceType>();
  auto d_jelems= k_jelems.template view<DeviceType>();
  auto d_rij= k_rij.template view<DeviceType>();

@ -162,6 +169,7 @@ void MLIAPDataKokkos<DeviceType>::generate_neighdata(class NeighList *list_in, i
      const int jelem = map(jtype);
      if (rsq < d_cutsq(itype,jtype)) {
        d_jatoms(ij) = j;
+        d_pair_i(ij) = i;
        d_jelems(ij) = jelem;
        d_rij(ij, 0) = delx;
        d_rij(ij, 1) = dely;
@ -172,8 +180,11 @@ void MLIAPDataKokkos<DeviceType>::generate_neighdata(class NeighList *list_in, i
    d_iatoms[ii] = i;
    d_ielems[ii] = ielem;
  });
-
-  modified(execution_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | JATOMS_MASK | JELEMS_MASK | RIJ_MASK | IJ_MASK );
+  Kokkos::parallel_for(nmax, KOKKOS_LAMBDA (int i)  {
+    const int itype = type(i);
+    d_elems(i) = map(itype);
+  });
+  modified(execution_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | ELEMS_MASK | JATOMS_MASK | PAIR_I_MASK | JELEMS_MASK | RIJ_MASK | IJ_MASK );
  eflag = eflag_in;
  vflag = vflag_in;
 }
@ -183,7 +194,8 @@ void MLIAPDataKokkos<DeviceType>::generate_neighdata(class NeighList *list_in, i
 template<class DeviceType>
 void MLIAPDataKokkos<DeviceType>::grow_neigharrays() {
  AtomKokkos *atomKK = (AtomKokkos *) atom;
-
+  f = atom->f;
+  f_device = atomKK->k_f.view<DeviceType>().data();
  // grow neighbor arrays if necessary

  if (natomneigh_max < nlistatoms) {
@ -207,6 +219,7 @@ void MLIAPDataKokkos<DeviceType>::grow_neigharrays() {
  auto x = atomKK->k_x.view<DeviceType>();
  auto type = atomKK->k_type.view<DeviceType>();
  auto d_cutsq=k_pairmliap->k_cutsq.template view<DeviceType>();
+  auto h_cutsq=k_pairmliap->k_cutsq.template view<LMPHostType>();
  auto d_numneighs = k_numneighs.template view<DeviceType>();
  Kokkos::parallel_reduce(nlistatoms, KOKKOS_LAMBDA (int ii, int &contrib) {
    const int i = d_ilist[ii];
@ -229,22 +242,24 @@ void MLIAPDataKokkos<DeviceType>::grow_neigharrays() {
    }
    d_numneighs(ii) = count;
    contrib += count;
-  }, nij_total);
+  }, npairs);
  modified(execution_space, NUMNEIGHS_MASK);

-  if (nneigh_max < nij_total) {
+  if (nneigh_max < npairs) {
    memoryKK->destroy_kokkos(k_jatoms,jatoms);
-    memoryKK->create_kokkos(k_jatoms, jatoms, nij_total, "mliap_data:jatoms");
+    memoryKK->create_kokkos(k_jatoms, jatoms, npairs, "mliap_data:jatoms");
+    memoryKK->destroy_kokkos(k_pair_i,pair_i);
+    memoryKK->create_kokkos(k_pair_i, pair_i, npairs, "mliap_data:pair_i");
    memoryKK->destroy_kokkos(k_jelems,jelems);
-    memoryKK->create_kokkos(k_jelems, jelems, nij_total, "mliap_data:jelems");
+    memoryKK->create_kokkos(k_jelems, jelems, npairs, "mliap_data:jelems");
    memoryKK->destroy_kokkos(k_rij,rij);
-    memoryKK->create_kokkos(k_rij, rij, nij_total, 3, "mliap_data:rij");
+    memoryKK->create_kokkos(k_rij, rij, npairs, 3, "mliap_data:rij");

    if (gradgradflag == 0){
      memoryKK->destroy_kokkos(k_graddesc,graddesc);
-      memoryKK->create_kokkos(k_graddesc, graddesc, nij_total, ndescriptors,3, "mliap_data:graddesc");
+      memoryKK->create_kokkos(k_graddesc, graddesc, npairs, ndescriptors,3, "mliap_data:graddesc");
    }
-    nneigh_max = nij_total;
+    nneigh_max = npairs;
   }
 }

@ -256,7 +271,9 @@ void MLIAPDataKokkos<DeviceType>::modified(ExecutionSpace space, unsigned int ma
    if (mask & IATOMS_MASK      ) k_iatoms         .modify<LMPDeviceType>();
    if (mask & IELEMS_MASK      ) k_ielems         .modify<LMPDeviceType>();
    if (mask & JATOMS_MASK      ) k_jatoms         .modify<LMPDeviceType>();
+    if (mask & PAIR_I_MASK      ) k_pair_i         .modify<LMPDeviceType>();
    if (mask & JELEMS_MASK      ) k_jelems         .modify<LMPDeviceType>();
+    if (mask & ELEMS_MASK       ) k_elems          .modify<LMPDeviceType>();
    if (mask & IJ_MASK          ) k_ij             .modify<LMPDeviceType>();
    if (mask & BETAS_MASK       ) k_betas          .modify<LMPDeviceType>();
    if (mask & DESCRIPTORS_MASK ) k_descriptors    .modify<LMPDeviceType>();
@ -274,7 +291,9 @@ void MLIAPDataKokkos<DeviceType>::modified(ExecutionSpace space, unsigned int ma
    if (mask & IATOMS_MASK      ) k_iatoms         .modify<LMPHostType>();
    if (mask & IELEMS_MASK      ) k_ielems         .modify<LMPHostType>();
    if (mask & JATOMS_MASK      ) k_jatoms         .modify<LMPHostType>();
+    if (mask & PAIR_I_MASK      ) k_pair_i         .modify<LMPHostType>();
    if (mask & JELEMS_MASK      ) k_jelems         .modify<LMPHostType>();
+    if (mask & ELEMS_MASK       ) k_elems          .modify<LMPHostType>();
    if (mask & IJ_MASK          ) k_ij             .modify<LMPHostType>();
    if (mask & BETAS_MASK       ) k_betas          .modify<LMPHostType>();
    if (mask & DESCRIPTORS_MASK ) k_descriptors    .modify<LMPHostType>();
@ -300,7 +319,9 @@ void MLIAPDataKokkos<DeviceType>::sync(ExecutionSpace space, unsigned int mask,
    if (mask & IATOMS_MASK      ) k_iatoms         .sync<LMPDeviceType>();
    if (mask & IELEMS_MASK      ) k_ielems         .sync<LMPDeviceType>();
    if (mask & JATOMS_MASK      ) k_jatoms         .sync<LMPDeviceType>();
+    if (mask & PAIR_I_MASK      ) k_pair_i         .sync<LMPDeviceType>();
    if (mask & JELEMS_MASK      ) k_jelems         .sync<LMPDeviceType>();
+    if (mask & ELEMS_MASK       ) k_elems          .sync<LMPDeviceType>();
    if (mask & IJ_MASK          ) k_ij             .sync<LMPDeviceType>();
    if (mask & BETAS_MASK       ) k_betas          .sync<LMPDeviceType>();
    if (mask & DESCRIPTORS_MASK ) k_descriptors    .sync<LMPDeviceType>();
@ -317,7 +338,9 @@ void MLIAPDataKokkos<DeviceType>::sync(ExecutionSpace space, unsigned int mask,
    if (mask & IATOMS_MASK      ) k_iatoms         .sync<LMPHostType>();
    if (mask & IELEMS_MASK      ) k_ielems         .sync<LMPHostType>();
    if (mask & JATOMS_MASK      ) k_jatoms         .sync<LMPHostType>();
+    if (mask & PAIR_I_MASK      ) k_pair_i         .sync<LMPHostType>();
    if (mask & JELEMS_MASK      ) k_jelems         .sync<LMPHostType>();
+    if (mask & ELEMS_MASK       ) k_elems          .sync<LMPHostType>();
    if (mask & IJ_MASK          ) k_ij             .sync<LMPHostType>();
    if (mask & BETAS_MASK       ) k_betas          .sync<LMPHostType>();
    if (mask & DESCRIPTORS_MASK ) k_descriptors    .sync<LMPHostType>();
--- a/src/KOKKOS/mliap_data_kokkos.h
+++ b/src/KOKKOS/mliap_data_kokkos.h
@ -43,6 +43,8 @@ enum {
  GAMMA_MASK_MASK  = 0x00001000,
  GAMMA_ROW_MASK   = 0x00002000,
  GAMMA_COL_MASK   = 0x00004000,
+  PAIR_I_MASK      = 0x00008000,
+  ELEMS_MASK       = 0x00010000,
 };
 // clang-format on

@ -65,6 +67,8 @@ template <class DeviceType> class MLIAPDataKokkos : public MLIAPData {
  DAT::tdual_int_1d k_iatoms;           // index of each atom
  DAT::tdual_int_1d k_ielems;           // element of each atom
  DAT::tdual_int_1d k_jatoms;           // index of each neighbor
+  DAT::tdual_int_1d k_elems;            // element of each atom in or not in the neighborlist
+  DAT::tdual_int_1d k_pair_i;           // index of each i atom for each ij pair
  DAT::tdual_int_1d k_jelems;           // element of each neighbor
  DAT::tdual_int_1d k_ij;               // Start location for each particle
  DAT::tdual_float_2d k_betas;          // betas for all atoms in list
@ -78,10 +82,123 @@ template <class DeviceType> class MLIAPDataKokkos : public MLIAPData {
  DAT::tdual_int_2d k_gamma_row_index;    // row (parameter) index
  DAT::tdual_int_2d k_gamma_col_index;    // column (descriptor) index

-  int nij_total;
+  // Just cached for python interface
+  double *f_device;

 protected:
  class LAMMPS *lmp;
 };
+
+// Now we need a specific device version for communication with python
+class MLIAPDataKokkosDevice {
+public:
+
+  MLIAPDataKokkosDevice(MLIAPDataKokkos<LMPDeviceType> &base) :
+    size_array_rows(base.size_array_rows),
+    size_array_cols(base.size_array_cols),
+    natoms(base.natoms),
+    yoffset(base.yoffset),
+    zoffset(base.zoffset),
+    ndims_force(base.ndims_force),
+    ndims_virial(base.ndims_virial),
+    size_gradforce(base.size_gradforce),
+    f(base.f_device),
+    gradforce(base.k_gradforce.d_view.data()),
+    betas(base.k_betas.d_view.data()),
+    descriptors(base.k_descriptors.d_view.data()),
+    eatoms(base.k_eatoms.d_view.data()),
+    energy(&base.energy),
+    ndescriptors(base.ndescriptors),
+    nparams(base.nparams),
+    nelements(base.nelements),
+    gamma_nnz(base.gamma_nnz),
+    gamma(base.k_gamma.d_view.data()),
+    gamma_row_index(base.k_gamma_row_index.d_view.data()),
+    gamma_col_index(base.k_gamma_col_index.d_view.data()),
+    egradient(nullptr),
+    ntotal(base.ntotal),
+    nlistatoms(base.nlistatoms),
+    natomneigh(base.natomneigh),
+    numneighs(base.numneighs),
+    iatoms(base.k_iatoms.d_view.data()),
+    pair_i(base.k_pair_i.d_view.data()),
+    ielems(base.k_ielems.d_view.data()),
+    nneigh_max(base.nneigh_max),
+    npairs(base.npairs),
+    jatoms(base.k_jatoms.d_view.data()),
+    jelems(base.k_jelems.d_view.data()),
+    elems(base.k_elems.d_view.data()),
+    rij(base.k_rij.d_view.data()),
+    graddesc(base.k_graddesc.d_view.data()),
+    eflag(base.eflag),
+    vflag(base.vflag),
+    pairmliap(dynamic_cast<PairMLIAPKokkos<LMPDeviceType> *>(base.pairmliap)),
+#if defined(KOKKOS_ENABLE_CUDA)
+    dev(1)
+#else
+    dev(0)
+#endif
+    {  }
+  int size_array_rows;
+  int size_array_cols;
+  int natoms;
+  int yoffset;
+  int zoffset;
+  int ndims_force;
+  int ndims_virial;
+  int size_gradforce;
+
+  //Write only
+  double *f;
+  double *gradforce;
+  double *betas;
+  double *descriptors;
+  double *eatoms;
+  double *energy;
+
+  // sizing
+  const int ndescriptors;
+  const int nparams;
+  const int nelements;
+
+  //Ignored for now
+  int gamma_nnz;
+  double *gamma;
+  int *gamma_row_index;
+  int *gamma_col_index;
+  double *egradient;
+
+  // Neighborlist stuff
+  const int ntotal;
+  const int nlistatoms;
+  const int natomneigh;
+  int *numneighs;
+  int *iatoms;
+  int *pair_i;
+  int *ielems;
+  const int nneigh_max;
+  const int npairs;
+  int *jatoms;
+  int *jelems;
+  int *elems;
+  double *rij;
+  double *graddesc;
+  int eflag;
+  int vflag;
+
+  class PairMLIAPKokkos<LMPDeviceType> *pairmliap;    // access to pair tally functions
+
+  int dev;
+
+#ifdef LMP_KOKKOS_GPU
+  MLIAPDataKokkosDevice(MLIAPDataKokkos<LMPHostType> &base) : ndescriptors(-1),nparams(-1),nelements(-1),ntotal(-1),nlistatoms(-1),natomneigh(-1),
+      nneigh_max(-1),npairs(-1)
+  {
+    // It cannot get here, but needed for compilation
+  }
+#endif
+};
+
+
 }    // namespace LAMMPS_NS
 #endif
--- a/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp
+++ b/src/KOKKOS/mliap_descriptor_so3_kokkos.cpp
@ -58,7 +58,7 @@ void MLIAPDescriptorSO3Kokkos<DeviceType>::compute_descriptors(class MLIAPData *
 {
  auto data = static_cast<MLIAPDataKokkos<DeviceType>*>(data_);
  so3ptr_kokkos->spectrum(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij,
-                          nmax, lmax, rcutfac, alpha, data->nij_total, data->ndescriptors);
+                          nmax, lmax, rcutfac, alpha, data->npairs, data->ndescriptors);

  Kokkos::deep_copy(data->k_descriptors.template view<DeviceType>(), so3ptr_kokkos->m_plist_r);
  Kokkos::deep_copy(data->k_descriptors.h_view, so3ptr_kokkos->m_plist_r);
@ -70,7 +70,7 @@ template <class DeviceType>
 void MLIAPDescriptorSO3Kokkos<DeviceType>::compute_forces(class MLIAPData *data_)
 {
  auto data = static_cast<MLIAPDataKokkos<DeviceType>*>(data_);
-  int npairs = data->nij_total;
+  int npairs = data->npairs;
  auto d_numneighs = data->k_numneighs.template view<DeviceType>();
  so3ptr_kokkos->spectrum_dxdr(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij,
                               nmax, lmax, rcutfac, alpha, npairs, data->ndescriptors);
@ -186,7 +186,7 @@ void MLIAPDescriptorSO3Kokkos<DeviceType>::compute_force_gradients(class MLIAPDa
  error->all(FLERR,"This has not been tested in cuda/kokkos");

  auto data = static_cast<MLIAPDataKokkos<DeviceType>*>(data_);
-  int npairs = data->nij_total;
+  int npairs = data->npairs;
  so3ptr_kokkos->spectrum_dxdr(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij,
                               nmax, lmax, rcutfac, alpha, npairs, data->ndescriptors);
  auto d_dplist_r = so3ptr_kokkos->k_dplist_r;
@ -239,7 +239,7 @@ template <class DeviceType>
 void MLIAPDescriptorSO3Kokkos<DeviceType>::compute_descriptor_gradients(class MLIAPData *data_)
 {
  auto data = static_cast<MLIAPDataKokkos<DeviceType>*>(data_);
-  bigint npairs = data->nij_total;
+  bigint npairs = data->npairs;
  so3ptr_kokkos->spectrum_dxdr(data->nlistatoms, data->k_numneighs, data->k_jelems, this->k_wjelem, data->k_rij, data->k_ij,
                               nmax, lmax, rcutfac, alpha, npairs, data->ndescriptors);
  auto graddesc = data->k_graddesc.template view<DeviceType>();
--- a/src/KOKKOS/mliap_model_python_kokkos.h
+++ b/src/KOKKOS/mliap_model_python_kokkos.h
@ -36,51 +36,11 @@ class MLIAPModelPythonKokkos : public MLIAPModelPython, public MLIAPModelKokkos<
  void compute_force_gradients(class MLIAPData *) override;
  void connect_param_counts();
 };
-}    // namespace LAMMPS_NS

-
-
-
-#include "mliap_data_kokkos.h"
-
-namespace LAMMPS_NS {
 class  MLIAPModelPythonKokkosDevice: public MLIAPModelPythonKokkos<LMPDeviceType> {
 };

-class MLIAPDataKokkosDevice {
-public:
+}    // namespace LAMMPS_NS

-  MLIAPDataKokkosDevice(MLIAPDataKokkos<LMPDeviceType> &base) :
-    ndescriptors(base.ndescriptors),
-    nlistatoms(base.nlistatoms),
-    ielems(base.k_ielems.d_view.data()),
-    descriptors(base.k_descriptors.d_view.data()),
-    betas(base.k_betas.d_view.data()),
-    eatoms(base.k_eatoms.d_view.data()),
-    energy(&base.energy),
-#if defined(KOKKOS_ENABLE_CUDA)
-    dev(1)
-#else
-    dev(0)
-#endif
-    {  }
-
-  const int ndescriptors;
-  const int nlistatoms;
-  int *ielems;
-  double *descriptors;
-  double *betas;
-  double *eatoms;
-  double *energy;
-  int dev;
-
-#ifdef LMP_KOKKOS_GPU
-  MLIAPDataKokkosDevice(MLIAPDataKokkos<LMPHostType> &base) : ndescriptors(-1),nlistatoms(-1)
-  {
-    // It cannot get here, but needed for compilation
-  }
-#endif
-};
-}

 #endif
--- a/src/KOKKOS/mliap_unified_couple_kokkos.pyx
+++ b/src/KOKKOS/mliap_unified_couple_kokkos.pyx
@ -0,0 +1,445 @@
+# cython: language_level=3
+# distutils: language = c++
+
+import pickle
+import numpy as np
+import lammps.mliap
+try:
+    import cupy
+except ImportError:
+    pass
+from libc.stdint cimport uintptr_t
+
+cimport cython
+from cpython.ref cimport PyObject
+from libc.stdlib cimport malloc, free
+
+
+cdef extern from "lammps.h" namespace "LAMMPS_NS":
+    cdef cppclass LAMMPS:
+        pass
+
+
+cdef extern from "mliap_data_kokkos.h" namespace "LAMMPS_NS":
+    cdef cppclass MLIAPDataKokkosDevice:
+        # ----- may not need -----
+        int size_array_rows
+        int size_array_cols
+        int natoms
+        int yoffset
+        int zoffset
+        int ndims_force
+        int ndims_virial
+        # -END- may not need -END-
+        int size_gradforce
+        # ----- write only -----
+        double * f
+        double * gradforce
+        double * betas         # betas for all atoms in list
+        double * descriptors   # descriptors for all atoms in list
+        double * eatoms         # energies for all atoms in list
+        double * energy
+        # -END- write only -END-
+        int ndescriptors        # number of descriptors
+        int nparams             # number of model parameters per element
+        int nelements           # number of elements
+
+        # data structures for grad-grad list (gamma)
+
+        # ----- ignore for now -----
+        int gamma_nnz           # number of non-zero entries in gamma
+        double * gamma         # gamma element
+        int * gamma_row_index  # row (parameter) index
+        int * gamma_col_index  # column (descriptor) index
+        double * egradient      # energy gradient w.r.t. parameters
+        # -END- ignore for now -END-
+
+        # data structures for mliap neighbor list
+        # only neighbors strictly inside descriptor cutoff
+
+        int ntotal              # total number of owned and ghost atoms on this proc
+        int nlistatoms          # current number of atoms in local atom lists
+        int natomneigh          # current number of atoms and ghosts in atom neighbor arrays
+        int * numneighs         # neighbors count for each atom
+        int * iatoms            # index of each atom
+        int * pair_i            # index of each i atom for each ij pair
+        int * ielems            # element of each atom
+        int nneigh_max          # number of ij neighbors allocated
+        int npairs              # number of ij neighbor pairs
+        int * jatoms            # index of each neighbor
+        int * jelems            # element of each neighbor
+        int * elems             # element of each atom in or not in the neighborlist
+        double * rij           # distance vector of each neighbor
+        # ----- write only -----
+        double * graddesc     # descriptor gradient w.r.t. each neighbor
+        # -END- write only -END-
+        int eflag               # indicates if energy is needed
+        int vflag               # indicates if virial is needed
+        void * pairmliap        # pointer to base class
+        int dev
+
+cdef extern from "mliap_unified_kokkos.h" namespace "LAMMPS_NS":
+    cdef cppclass MLIAPDummyDescriptor:
+        MLIAPDummyDescriptor(PyObject *, LAMMPS *) except +
+        int ndescriptors    # number of descriptors
+        int nelements       # # of unique elements
+        char *elements     # names of unique elements
+        double cutmax       # maximum cutoff needed
+        double rcutfac
+        double *radelem     # element radii
+
+        void compute_descriptors(MLIAPDataKokkosDevice *)
+        void compute_forces(MLIAPDataKokkosDevice *)
+        void set_elements(char **, int)
+
+    cdef cppclass MLIAPDummyModel:
+        MLIAPDummyModel(PyObject *, LAMMPS *, char * = NULL) except +
+        int ndescriptors    # number of descriptors
+        int nparams         # number of parameters per element
+        int nelements;      # # of unique elements
+
+        void compute_gradients(MLIAPDataKokkosDevice *)
+
+    cdef void update_pair_energy(MLIAPDataKokkosDevice *, double *) except +
+    cdef void update_pair_forces(MLIAPDataKokkosDevice *, double *) except +
+
+
+LOADED_MODEL = None
+
+
+# @property sans getter
+def write_only_property(fset):
+    return property(fget=None, fset=fset)
+
+cdef create_array(device, void *pointer, shape,is_int):
+    size=1
+    for i in shape:
+        size = size*i
+
+    if ( device == 1):
+        mem = cupy.cuda.UnownedMemory(ptr=int( <uintptr_t> pointer), owner=None, size=size)
+        memptr = cupy.cuda.MemoryPointer(mem, 0)
+        type=cupy.double
+        if (is_int):
+            type=cupy.int32
+        return cupy.ndarray(shape, type, memptr=memptr)
+    else:
+        if (len(shape) == 1 ):
+            if (is_int):
+                return np.asarray(<int[:shape[0]]>pointer)
+            else:
+                return np.asarray(<double[:shape[0]]>pointer)
+        else:
+            if (is_int):
+                return np.asarray(<int[:shape[0],:shape[1]]>pointer)
+            else:
+                return np.asarray(<double[:shape[0],:shape[1]]>pointer)
+            
+
+
+# Cython implementation of MLIAPData
+# Automatically converts between C arrays and numpy when needed
+cdef class MLIAPDataPy:
+    cdef MLIAPDataKokkosDevice * data
+    
+    def __cinit__(self):
+        self.data = NULL
+
+    def update_pair_energy_cpu(self, eij):
+        cdef double[:] eij_arr = eij
+        update_pair_energy(self.data, &eij_arr[0])
+    def update_pair_energy_gpu(self, eij):
+        cdef uintptr_t ptr = eij.data.ptr
+        update_pair_energy(self.data, <double*>ptr)  
+    def update_pair_energy(self, eij):
+        if self.data.dev==0:
+            self.update_pair_energy_cpu(eij)
+        else:
+            self.update_pair_energy_gpu(eij)
+
+    def update_pair_forces_cpu(self, fij):
+        cdef double[:, ::1] fij_arr = fij
+        update_pair_forces(self.data, &fij_arr[0][0])
+    def update_pair_forces_gpu(self, fij):
+        cdef uintptr_t ptr = fij.data.ptr
+        update_pair_forces(self.data, <double*>ptr)  
+    def update_pair_forces(self, fij):
+        if self.data.dev==0:
+            self.update_pair_forces_cpu(fij)
+        else:
+            self.update_pair_forces_gpu(fij)
+    @property
+    def f(self):
+        if self.data.f is NULL:
+            return None
+        return cupy.asarray(<double[:self.ntotal, :3]> self.data.f)
+    
+    @property
+    def size_gradforce(self):
+        return self.data.size_gradforce
+ 
+    @write_only_property
+    def gradforce(self, value):
+        if self.data.gradforce is NULL:
+            raise ValueError("attempt to set NULL gradforce")
+        cdef double[:, :] gradforce_view = <double[:self.ntotal, :self.size_gradforce]> &self.data.gradforce[0]
+        cdef double[:, :] value_view = value
+        gradforce_view[:] = value_view
+        print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize gradforce")
+ 
+    @write_only_property
+    def betas(self, value):
+        if self.data.betas is NULL:
+            raise ValueError("attempt to set NULL betas")
+        cdef double[:, :] betas_view = <double[:self.nlistatoms, :self.ndescriptors]> &self.data.betas[0]
+        cdef double[:, :] value_view = value
+        betas_view[:] = value_view
+        print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize ")
+
+    @write_only_property
+    def descriptors(self, value):
+        if self.data.descriptors is NULL:
+            raise ValueError("attempt to set NULL descriptors")
+        cdef double[:, :] descriptors_view = <double[:self.nlistatoms, :self.ndescriptors]> &self.data.descriptors[0]
+        cdef double[:, :] value_view = value
+        descriptors_view[:] = value_view
+        print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize descriptors")
+
+    @write_only_property
+    def eatoms(self, value):
+        if self.data.eatoms is NULL:
+            raise ValueError("attempt to set NULL eatoms")
+        cdef double[:] eatoms_view = <double[:self.nlistatoms]> &self.data.eatoms[0]
+        cdef double[:] value_view = value
+        eatoms_view[:] = value_view
+        print("This code has not been tested or optimized for the GPU, if you are getting this warning optimize eatoms")
+
+
+    @write_only_property
+    def energy(self, value):
+        self.data.energy[0] = <double>value
+
+    @property
+    def ndescriptors(self):
+        return self.data.ndescriptors
+
+    @property
+    def nparams(self):
+        return self.data.nparams
+
+    @property
+    def nelements(self):
+        return self.data.nelements
+
+    # data structures for grad-grad list (gamma)
+
+    @property
+    def gamma_nnz(self):
+        return self.data.gamma_nnz
+
+    @property
+    def gamma(self):
+        if self.data.gamma is NULL:
+            return None
+        return create_array(self.data.dev, self.data.gamma, [self.nlistatoms, self.gama_nnz],False)
+
+    @property
+    def gamma_row_index(self):
+        if self.data.gamma_row_index is NULL:
+            return None
+        return create_array(self.data.dev, self.data.gamma_row_index, [self.nlistatoms, self.gamma_nnz],True)
+
+    @property
+    def gamma_col_index(self):
+        if self.data.gamma_col_index is NULL:
+            return None
+        return create_array(self.data.dev, self.data.gamma_col_index, [self.nlistatoms, self.gamma_nnz],True)
+
+    @property
+    def egradient(self):
+        if self.data.egradient is NULL:
+            return None
+        return create_array(self.data.dev, self.data.egradient, [self.nelements*self.nparams],False)
+
+    # data structures for mliap neighbor list
+    # only neighbors strictly inside descriptor cutoff
+
+    @property
+    def ntotal(self):
+        return self.data.ntotal
+    
+    @property
+    def elems(self):
+        if self.data.elems is NULL:
+            return None
+        return create_array(self.data.dev, self.data.elems, [self.ntotal],True)
+
+    @property
+    def nlistatoms(self):
+        return self.data.nlistatoms
+    
+    @property
+    def natomneigh(self):
+        return self.data.natomneigh
+
+    @property
+    def numneighs(self):
+        if self.data.numneighs is NULL:
+            return None
+        return create_array(self.data.dev, self.data.numneighs, [self.natomneigh],False)
+
+    @property
+    def iatoms(self):
+        if self.data.iatoms is NULL:
+            return None
+        return create_array(self.data.dev, self.data.iatoms, [self.natomneigh],True)
+    
+    @property
+    def ielems(self):
+        if self.data.ielems is NULL:
+            return None
+        return create_array(self.data.dev, self.data.ielems, [self.natomneigh],True)
+
+    @property
+    def npairs(self):
+        return self.data.npairs
+
+    @property
+    def pair_i(self):
+        if self.data.pair_i is NULL:
+            return None
+        return create_array(self.data.dev, self.data.pair_i, [self.npairs],True)
+    
+    @property
+    def pair_j(self):
+        return self.jatoms
+
+    @property
+    def jatoms(self):
+        if self.data.jatoms is NULL:
+            return None
+        return create_array(self.data.dev, self.data.jatoms, [self.npairs],True)
+    
+    @property
+    def jelems(self):
+        if self.data.jelems is NULL:
+            return None
+        return create_array(self.data.dev, self.data.jelems, [self.npairs],True)
+
+
+    @property
+    def rij(self):
+        if self.data.rij is NULL:
+            return None
+        return create_array(self.data.dev, self.data.rij, [self.npairs,3],False)
+
+    @write_only_property
+    def graddesc(self, value):
+        if self.data.graddesc is NULL:
+            raise ValueError("attempt to set NULL graddesc")
+        cdef double[:, :, :] graddesc_view = <double[:self.npairs, :self.ndescriptors, :3]> &self.data.graddesc[0]
+        cdef double[:, :, :] value_view = value
+        graddesc_view[:] = value_view
+
+    @property
+    def eflag(self):
+        return self.data.eflag
+
+    @property
+    def vflag(self):
+        return self.data.vflag
+
+
+# Interface between C and Python compute functions
+cdef class MLIAPUnifiedInterface:
+    cdef MLIAPDummyModel * model
+    cdef MLIAPDummyDescriptor * descriptor
+    cdef unified_impl
+
+    def __init__(self, unified_impl):
+        self.model = NULL
+        self.descriptor = NULL
+        self.unified_impl = unified_impl
+    
+    def compute_gradients(self, data):
+        self.unified_impl.compute_gradients(data)
+    
+    def compute_descriptors(self, data):
+        self.unified_impl.compute_descriptors(data)
+    
+    def compute_forces(self, data):
+        self.unified_impl.compute_forces(data)
+
+
+cdef public void compute_gradients_python_kokkos(unified_int, MLIAPDataKokkosDevice *data) except * with gil:
+    pydata = MLIAPDataPy()
+    pydata.data = data
+    unified_int.compute_gradients(pydata)
+
+
+cdef public void compute_descriptors_python_kokkos(unified_int, MLIAPDataKokkosDevice *data) except * with gil:
+    pydata = MLIAPDataPy()
+    pydata.data = data
+    unified_int.compute_descriptors(pydata)
+
+
+cdef public void compute_forces_python_kokkos(unified_int, MLIAPDataKokkosDevice *data) except * with gil:
+    pydata = MLIAPDataPy()
+    pydata.data = data
+    unified_int.compute_forces(pydata)
+
+
+# Create a MLIAPUnifiedInterface and connect it to the dummy model, descriptor
+cdef public object mliap_unified_connect_kokkos(char *fname, MLIAPDummyModel * model,
+                                                MLIAPDummyDescriptor * descriptor) with gil:
+    str_fname = fname.decode('utf-8')
+    if str_fname == 'EXISTS':
+        if LOADED_MODEL is None:
+            raise ValueError("No unified model loaded")
+        unified = LOADED_MODEL
+    elif str_fname.endswith(".pt") or str_fname.endswith('.pth'):
+        import torch
+        unified = torch.load(str_fname)
+    else:
+        with open(str_fname, 'rb') as pfile:
+            unified = pickle.load(pfile)
+
+    unified_int = MLIAPUnifiedInterface(unified)
+    unified_int.model = model
+    unified_int.descriptor = descriptor
+
+    unified.interface = unified_int
+
+    if unified.ndescriptors is None:
+        raise ValueError("no descriptors set")
+
+    unified_int.descriptor.ndescriptors = <int>unified.ndescriptors
+    unified_int.descriptor.rcutfac = <double>unified.rcutfac
+    unified_int.model.ndescriptors = <int>unified.ndescriptors
+    unified_int.model.nparams = <int>unified.nparams
+
+    if unified.element_types is None:
+        raise ValueError("no element type set")
+    
+    cdef int nelements = <int>len(unified.element_types)
+    cdef char **elements = <char**>malloc(nelements * sizeof(char*))
+
+    if not elements:
+        raise MemoryError("failed to allocate memory for element names")
+
+    cdef char *elem_name
+    for i, elem in enumerate(unified.element_types):
+        elem_name_bytes = elem.encode('UTF-8')
+        elem_name = elem_name_bytes
+        elements[i] = &elem_name[0]
+    unified_int.descriptor.set_elements(elements, nelements)
+    unified_int.model.nelements = nelements
+
+    free(elements)
+    return unified_int
+
+
+# For pre-loading a Python model
+def load_from_python(unified):
+    global LOADED_MODEL
+    LOADED_MODEL = unified
--- a/src/KOKKOS/mliap_unified_kokkos.cpp
+++ b/src/KOKKOS/mliap_unified_kokkos.cpp
@ -0,0 +1,388 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Matt Bettencourt (NVIDIA)
+------------------------------------------------------------------------- */
+
+#ifdef MLIAP_PYTHON
+
+#include "mliap_unified_kokkos.h"
+#include <Python.h>
+
+#include "error.h"
+#include "lmppython.h"
+#include "memory.h"
+#include "mliap_data.h"
+#include "mliap_unified_couple_kokkos.h"
+#include "pair_mliap.h"
+#include "python_compat.h"
+#include "utils.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+MLIAPDummyDescriptorKokkos<DeviceType>::MLIAPDummyDescriptorKokkos(LAMMPS *_lmp) :
+  Pointers(_lmp), MLIAPDummyDescriptor(_lmp), MLIAPDescriptorKokkos<DeviceType>(lmp, this) {}
+
+template <class DeviceType>
+MLIAPDummyDescriptorKokkos<DeviceType>::~MLIAPDummyDescriptorKokkos()
+{
+  // done in base class
+  // Py_DECREF(unified_interface);
+}
+
+/* ----------------------------------------------------------------------
+   invoke compute_descriptors from Cython interface
+   ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+void MLIAPDummyDescriptorKokkos<DeviceType>::compute_descriptors(class MLIAPData *data)
+{
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  auto *kokkos_data = dynamic_cast<MLIAPDataKokkos<DeviceType>*>(data);
+  MLIAPDataKokkosDevice raw_data(*kokkos_data);
+  compute_descriptors_python_kokkos(unified_interface, &raw_data);
+  if (PyErr_Occurred()) {
+    PyErr_Print();
+    PyErr_Clear();
+    PyGILState_Release(gstate);
+    lmp->error->all(FLERR, "Running mliappy unified compute_descriptors failure.");
+  }
+  PyGILState_Release(gstate);
+}
+
+/* ----------------------------------------------------------------------
+   invoke compute_forces from Cython interface
+   ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+void MLIAPDummyDescriptorKokkos<DeviceType>::compute_forces(class MLIAPData *data)
+{
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  auto *kokkos_data = dynamic_cast<MLIAPDataKokkos<DeviceType>*>(data);
+  MLIAPDataKokkosDevice raw_data(*kokkos_data);
+  compute_forces_python_kokkos(unified_interface, &raw_data);
+  if (PyErr_Occurred()) {
+    PyErr_Print();
+    PyErr_Clear();
+    PyGILState_Release(gstate);
+    lmp->error->all(FLERR, "Running mliappy unified compute_forces failure.");
+  }
+  PyGILState_Release(gstate);
+}
+
+// not implemented
+template <class DeviceType>
+void MLIAPDummyDescriptorKokkos<DeviceType>::compute_force_gradients(class MLIAPData *)
+{
+  error->all(FLERR, "compute_force_gradients not implemented");
+}
+
+// not implemented
+template <class DeviceType>
+void MLIAPDummyDescriptorKokkos<DeviceType>::compute_descriptor_gradients(class MLIAPData *)
+{
+  error->all(FLERR, "compute_descriptor_gradients not implemented");
+}
+
+template <class DeviceType>
+void MLIAPDummyDescriptorKokkos<DeviceType>::init()
+{
+  memory->create(radelem, nelements, "mliap_dummy_descriptor:radelem");
+  for (int ielem = 0; ielem < nelements; ielem++) { radelem[ielem] = 1; }
+
+  double cut;
+  cutmax = 0.0;
+  memory->create(cutsq, nelements, nelements, "mliap/descriptor/dummy:cutsq");
+  memory->create(cutghost, nelements, nelements, "mliap/descriptor/dummy:cutghost");
+  for (int ielem = 0; ielem < nelements; ielem++) {
+    // rcutfac set from python, is global cutoff for all elements
+    cut = 2.0 * radelem[ielem] * rcutfac;
+    if (cut > cutmax) cutmax = cut;
+    cutsq[ielem][ielem] = cut * cut;
+    cutghost[ielem][ielem] = cut * cut;
+    for (int jelem = ielem + 1; jelem < nelements; jelem++) {
+      cut = (radelem[ielem] + radelem[jelem]) * rcutfac;
+      cutsq[ielem][jelem] = cutsq[jelem][ielem] = cut * cut;
+      cutghost[ielem][jelem] = cutghost[jelem][ielem] = cut * cut;
+    }
+  }
+}
+
+template <class DeviceType>
+void MLIAPDummyDescriptorKokkos<DeviceType>::set_elements(char **elems, int nelems)
+{
+  nelements = nelems;
+  elements = new char *[nelems];
+  for (int i = 0; i < nelems; i++) { elements[i] = utils::strdup(elems[i]); }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+MLIAPDummyModelKokkos<DeviceType>::MLIAPDummyModelKokkos(LAMMPS *lmp, char *coefffilename) :
+MLIAPDummyModel(lmp,coefffilename),
+MLIAPModelKokkos<DeviceType>(lmp, this)
+{
+  nonlinearflag = 1;
+}
+
+template <class DeviceType>
+MLIAPDummyModelKokkos<DeviceType>::~MLIAPDummyModelKokkos()
+{
+  // manually decrement borrowed reference from Python
+  Py_DECREF(unified_interface);
+}
+
+template <class DeviceType>
+int MLIAPDummyModelKokkos<DeviceType>::get_nparams()
+{
+  return nparams;
+}
+
+template <class DeviceType>
+int MLIAPDummyModelKokkos<DeviceType>::get_gamma_nnz(class MLIAPData *)
+{
+  // TODO: get_gamma_nnz
+  return 0;
+}
+
+/* ----------------------------------------------------------------------
+   invoke compute_gradients from Cython interface
+   ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+void MLIAPDummyModelKokkos<DeviceType>::compute_gradients(class MLIAPData *data)
+{
+  PyGILState_STATE gstate = PyGILState_Ensure();
+  auto *kokkos_data = dynamic_cast<MLIAPDataKokkos<DeviceType>*>(data);
+  MLIAPDataKokkosDevice raw_data(*kokkos_data);
+  compute_gradients_python_kokkos(unified_interface, &raw_data);
+  if (PyErr_Occurred()) {
+    PyErr_Print();
+    PyErr_Clear();
+    PyGILState_Release(gstate);
+    MLIAPModelKokkos<DeviceType>::error->all(FLERR, "Running mliappy unified compute_gradients failure.");
+  }
+  PyGILState_Release(gstate);
+}
+
+// not implemented
+template <class DeviceType>
+void MLIAPDummyModelKokkos<DeviceType>::compute_gradgrads(class MLIAPData *)
+{
+  MLIAPModelKokkos<DeviceType>::error->all(FLERR, "compute_gradgrads not implemented");
+}
+
+// not implemented
+template <class DeviceType>
+void MLIAPDummyModelKokkos<DeviceType>::compute_force_gradients(class MLIAPData *)
+{
+  MLIAPModelKokkos<DeviceType>::error->all(FLERR, "compute_force_gradients not implemented");
+}
+
+/* ----------------------------------------------------------------------
+   memory usage unclear due to Cython/Python implementation
+   ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+double MLIAPDummyModelKokkos<DeviceType>::memory_usage()
+{
+  // TODO: implement memory usage in Cython(?)
+  return 0;
+}
+
+// not implemented
+template <class DeviceType>
+void MLIAPDummyModelKokkos<DeviceType>::read_coeffs(char *)
+{
+  MLIAPModelKokkos<DeviceType>::error->all(FLERR, "read_coeffs not implemented");
+}
+
+/* ----------------------------------------------------------------------
+   build the unified interface object, connect to dummy model and descriptor
+   ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+MLIAPBuildUnifiedKokkos_t<DeviceType> LAMMPS_NS::build_unified(char *unified_fname, MLIAPDataKokkos<DeviceType> *data, LAMMPS *lmp,
+                                             char *coefffilename)
+{
+  lmp->python->init();
+  PyGILState_STATE gstate = PyGILState_Ensure();
+
+  PyObject *pyMain = PyImport_AddModule("__main__");
+
+  if (!pyMain) {
+    PyGILState_Release(gstate);
+    lmp->error->all(FLERR, "Could not initialize embedded Python");
+  }
+
+  PyImport_ImportModule("mliap_unified_couple_kokkos");
+
+  if (PyErr_Occurred()) {
+    PyErr_Print();
+    PyErr_Clear();
+    PyGILState_Release(gstate);
+    lmp->error->all(FLERR, "Loading mliappy unified module failure.");
+  }
+
+  // Connect dummy model, dummy descriptor, data to Python unified
+  MLIAPDummyModelKokkos<DeviceType> *model = new MLIAPDummyModelKokkos<DeviceType>(lmp, coefffilename);
+  MLIAPDummyDescriptorKokkos<DeviceType> *descriptor = new MLIAPDummyDescriptorKokkos<DeviceType>(lmp);
+
+  PyObject *unified_interface = mliap_unified_connect_kokkos(unified_fname, model, descriptor);
+  if (PyErr_Occurred()) {
+    PyErr_Print();
+    PyErr_Clear();
+    PyGILState_Release(gstate);
+    lmp->error->all(FLERR, "Running mliappy unified module failure.");
+  }
+
+  // Borrowed references must be manually incremented
+  model->unified_interface = unified_interface;
+  Py_INCREF(unified_interface);
+  descriptor->unified_interface = unified_interface;
+  Py_INCREF(unified_interface);
+
+  PyGILState_Release(gstate);
+
+  MLIAPBuildUnifiedKokkos_t<DeviceType> build = {data, descriptor, model};
+  return build;
+}
+
+/* ----------------------------------------------------------------------
+   set energy for ij atom pairs
+   ---------------------------------------------------------------------- */
+
+void LAMMPS_NS::update_pair_energy(MLIAPDataKokkosDevice *data, double *eij)
+{
+  double e_total = 0.0;
+  auto d_eatoms = data->eatoms;
+  auto d_pair_i= data->pair_i;
+  const auto nlistatoms = data->nlistatoms;
+  Kokkos::parallel_for(nlistatoms, KOKKOS_LAMBDA(int ii){
+    d_eatoms[ii] = 0;
+  });
+
+  Kokkos::parallel_reduce(data->npairs, KOKKOS_LAMBDA(int ii, double &local_sum){
+    int i = d_pair_i[ii];
+    double e = 0.5 * eij[ii];
+
+    // must not count any contribution where i is not a local atom
+    if (i < nlistatoms) {
+      Kokkos::atomic_add(&d_eatoms[i], e);
+      local_sum += e;
+    }
+  },*data->energy);
+}
+
+/* ----------------------------------------------------------------------
+   set forces for ij atom pairs
+   ---------------------------------------------------------------------- */
+
+void LAMMPS_NS::update_pair_forces(MLIAPDataKokkosDevice *data, double *fij)
+{
+  const auto nlistatoms = data->nlistatoms;
+  auto *f = data->f;
+  auto pair_i = data->pair_i;
+  auto j_atoms = data->jatoms;
+  auto vflag = data->vflag;
+  auto rij = data->rij;
+  int vflag_either=data->pairmliap->vflag_either, vflag_global=data->pairmliap->vflag_global, vflag_atom=data->pairmliap->vflag_atom;
+  auto d_vatom = data->pairmliap->k_vatom.template view<LMPDeviceType>();
+  Kokkos::View<double[6], LMPDeviceType> virial("virial");
+
+  Kokkos::parallel_for(data->npairs,KOKKOS_LAMBDA (int ii) {
+
+    int ii3 = ii * 3;
+    int i = pair_i[ii];
+    int j = j_atoms[ii];
+
+    // must not count any contribution where i is not a local atom
+    if (i < nlistatoms) {
+      Kokkos::atomic_add(&f[i*3+0], fij[ii3+0]);
+      Kokkos::atomic_add(&f[i*3+1], fij[ii3+1]);
+      Kokkos::atomic_add(&f[i*3+2], fij[ii3+2]);
+      Kokkos::atomic_add(&f[j*3+0],-fij[ii3+0]);
+      Kokkos::atomic_add(&f[j*3+1],-fij[ii3+1]);
+      Kokkos::atomic_add(&f[j*3+2],-fij[ii3+2]);
+      if (vflag) {
+        double v[6];
+        v[0] = -rij[ii3+0]*fij[ii3+0];
+        v[1] = -rij[ii3+1]*fij[ii3+1];
+        v[2] = -rij[ii3+2]*fij[ii3+2];
+        v[3] = -rij[ii3+0]*fij[ii3+1];
+        v[4] = -rij[ii3+0]*fij[ii3+2];
+        v[5] = -rij[ii3+1]*fij[ii3+2];
+        if (vflag_global) {
+          Kokkos::atomic_add(&virial[0], v[0]);
+          Kokkos::atomic_add(&virial[1], v[1]);
+          Kokkos::atomic_add(&virial[2], v[2]);
+          Kokkos::atomic_add(&virial[3], v[3]);
+          Kokkos::atomic_add(&virial[4], v[4]);
+          Kokkos::atomic_add(&virial[5], v[5]);
+        }
+        if (vflag_atom) {
+          Kokkos::atomic_add(&d_vatom(i,0), 0.5*v[0]);
+          Kokkos::atomic_add(&d_vatom(i,1), 0.5*v[1]);
+          Kokkos::atomic_add(&d_vatom(i,2), 0.5*v[2]);
+          Kokkos::atomic_add(&d_vatom(i,3), 0.5*v[3]);
+          Kokkos::atomic_add(&d_vatom(i,4), 0.5*v[4]);
+          Kokkos::atomic_add(&d_vatom(i,5), 0.5*v[5]);
+
+          Kokkos::atomic_add(&d_vatom(j,0), 0.5*v[0]);
+          Kokkos::atomic_add(&d_vatom(j,1), 0.5*v[1]);
+          Kokkos::atomic_add(&d_vatom(j,2), 0.5*v[2]);
+          Kokkos::atomic_add(&d_vatom(j,3), 0.5*v[3]);
+          Kokkos::atomic_add(&d_vatom(j,4), 0.5*v[4]);
+          Kokkos::atomic_add(&d_vatom(j,5), 0.5*v[5]);
+        }
+      }
+    }
+  });
+
+  if (vflag) {
+    if (vflag_global) {
+      Kokkos::View<double[6], LMPHostType> h_virial("h_virial");
+      Kokkos::deep_copy(h_virial,virial);
+      for (int i=0;i<6;++i)
+        data->pairmliap->virial[i]+=h_virial[i];
+    }
+    if (vflag_atom) {
+      data->pairmliap->k_vatom.template modify<LMPDeviceType>();
+      data->pairmliap->k_vatom.template sync<LMPHostType>();
+    }
+  }
+}
+
+namespace LAMMPS_NS {
+template class MLIAPDummyModelKokkos<LMPDeviceType>;
+template class MLIAPDummyDescriptorKokkos<LMPDeviceType>;
+template MLIAPBuildUnifiedKokkos_t<LMPDeviceType> LAMMPS_NS::build_unified(char *unified_fname, MLIAPDataKokkos<LMPDeviceType> *data, LAMMPS *lmp,
+                                             char *coefffilename);
+//template void LAMMPS_NS::update_pair_energy(MLIAPDataKokkos<LMPDeviceType> *data, double *eij);
+//template void LAMMPS_NS::update_pair_forces(MLIAPDataKokkos<LMPDeviceType> *data, double *fij);
+#ifdef LMP_KOKKOS_GPU
+template class MLIAPDummyModelKokkos<LMPHostType>;
+template class MLIAPDummyDescriptorKokkos<LMPHostType>;
+template MLIAPBuildUnifiedKokkos_t<LMPHostType> LAMMPS_NS::build_unified(char *unified_fname, MLIAPDataKokkos<LMPHostType> *data, LAMMPS *lmp,
+                                             char *coefffilename);
+//template void LAMMPS_NS::update_pair_energy(MLIAPDataKokkos<LMPHostType> *data, double *eij);
+//template void LAMMPS_NS::update_pair_forces(MLIAPDataKokkos<LMPHostType> *data, double *fij);
+#endif
+}
+#endif
+
--- a/src/KOKKOS/mliap_unified_kokkos.h
+++ b/src/KOKKOS/mliap_unified_kokkos.h
@ -0,0 +1,66 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef LMP_MLIAP_UNIFIED_KOKKOS_H
+#define LMP_MLIAP_UNIFIED_KOKKOS_H
+
+#include "mliap_unified.h"
+#include "mliap_descriptor_kokkos.h"
+#include "mliap_model_kokkos.h"
+#include "mliap_data_kokkos.h"
+
+#include <Python.h>
+
+namespace LAMMPS_NS {
+template <class DeviceType>
+class MLIAPDummyDescriptorKokkos : public MLIAPDummyDescriptor, public MLIAPDescriptorKokkos<DeviceType>{
+ public:
+  MLIAPDummyDescriptorKokkos(LAMMPS *);
+  ~MLIAPDummyDescriptorKokkos() override;
+  void compute_descriptors(class MLIAPData *) override;
+  void compute_forces(class MLIAPData *) override;
+  void compute_force_gradients(class MLIAPData *) override;
+  void compute_descriptor_gradients(class MLIAPData *) override;
+  void init() override;
+  void set_elements(char **, int);
+};
+template <class DeviceType>
+class MLIAPDummyModelKokkos : public MLIAPDummyModel, public MLIAPModelKokkos<DeviceType> {
+ public:
+  MLIAPDummyModelKokkos(LAMMPS *, char * = nullptr);
+  ~MLIAPDummyModelKokkos() override;
+  int get_nparams() override;
+  int get_gamma_nnz(class MLIAPData *) override;
+  void compute_gradients(class MLIAPData *) override;
+  void compute_gradgrads(class MLIAPData *) override;
+  void compute_force_gradients(class MLIAPData *) override;
+  double memory_usage() override;
+
+ protected:
+  void read_coeffs(char *) override;
+};
+
+template <class DeviceType>
+struct MLIAPBuildUnifiedKokkos_t {
+  MLIAPDataKokkos<DeviceType> *data;
+  MLIAPDummyDescriptorKokkos<DeviceType> *descriptor;
+  MLIAPDummyModelKokkos<DeviceType> *model;
+};
+template <class DeviceType>
+MLIAPBuildUnifiedKokkos_t<DeviceType> build_unified(char *, MLIAPDataKokkos<DeviceType> *, LAMMPS *, char * = NULL);
+void update_pair_energy(MLIAPDataKokkosDevice *, double *);
+void update_pair_forces(MLIAPDataKokkosDevice *, double *);
+
+}    // namespace LAMMPS_NS
+
+#endif
--- a/src/KOKKOS/pair_mliap_kokkos.cpp
+++ b/src/KOKKOS/pair_mliap_kokkos.cpp
@ -24,6 +24,7 @@
 #include "mliap_model_linear_kokkos.h"
 #ifdef MLIAP_PYTHON
 #include "mliap_model_python_kokkos.h"
+#include "mliap_unified_kokkos.h"
 #endif
 #include "error.h"
 #include "neigh_request.h"
@ -66,7 +67,6 @@ PairMLIAPKokkos<DeviceType>::~PairMLIAPKokkos()
 template<class DeviceType>
 void PairMLIAPKokkos<DeviceType>::compute(int eflag, int vflag)
 {
-  atomKK->sync(Host,F_MASK | ENERGY_MASK | VIRIAL_MASK);
  atomKK->sync(execution_space,X_MASK | TYPE_MASK );
  MLIAPDataKokkos<DeviceType> *k_data = (MLIAPDataKokkos<DeviceType>*)(data);

@ -97,7 +97,7 @@ void PairMLIAPKokkos<DeviceType>::compute(int eflag, int vflag)

  // compute descriptors, if needed
  if (model->nonlinearflag || eflag)  {
-      k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | JATOMS_MASK | JELEMS_MASK | RIJ_MASK );
+    k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | ELEMS_MASK | JATOMS_MASK | PAIR_I_MASK | JELEMS_MASK | RIJ_MASK );
    descriptor->compute_descriptors(data);
    if (!is_kokkos_descriptor)
      k_data->modified(descriptor_space, DESCRIPTORS_MASK);
@ -109,12 +109,13 @@ void PairMLIAPKokkos<DeviceType>::compute(int eflag, int vflag)
  k_data->modified(model_space, BETAS_MASK);
  if (eflag_atom)
    k_data->modified(model_space, EATOMS_MASK);
-  e_tally(data);

  // calculate force contributions beta_i*dB_i/dR_j
-  k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | BETAS_MASK | JATOMS_MASK | JELEMS_MASK | RIJ_MASK );
+  k_data->sync(descriptor_space, NUMNEIGHS_MASK | IATOMS_MASK | IELEMS_MASK | ELEMS_MASK | BETAS_MASK | JATOMS_MASK | PAIR_I_MASK | JELEMS_MASK | RIJ_MASK );
  descriptor->compute_forces(data);

+  e_tally(data);
+
  if (evflag) {
    atomKK->modified(descriptor_space,F_MASK | ENERGY_MASK | VIRIAL_MASK);
    atomKK->sync(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK);
@ -181,6 +182,25 @@ void PairMLIAPKokkos<DeviceType>::settings(int narg, char ** arg)
        iarg += 3;
      } else
        new_args.push_back(arg[iarg++]);
+    } else if (strcmp(arg[iarg], "unified") == 0) {
+#ifdef MLIAP_PYTHON
+      printf("IN SETUP UNIFIED\n");
+      if (model != nullptr) error->all(FLERR,"Illegal multiple pair_style mliap model definitions");
+      if (descriptor != nullptr) error->all(FLERR,"Illegal multiple pair_style mliap descriptor definitions");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "pair_style mliap unified", error);
+      MLIAPBuildUnifiedKokkos_t<DeviceType> build = build_unified(arg[iarg+1], dynamic_cast<MLIAPDataKokkos<DeviceType>*>(data), lmp);
+      if (iarg+3 > narg) {
+        ghostneigh = 0;
+      } else {
+        ghostneigh = utils::logical(FLERR, arg[iarg+2], false, lmp);
+      }
+
+      iarg += 3;
+      model = build.model;
+      descriptor = build.descriptor;
+#else
+      error->all(FLERR,"Using pair_style mliap unified requires ML-IAP with python support");
+#endif
    } else
      new_args.push_back(arg[iarg++]);
  }
@ -226,13 +246,6 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
  k_map.modify<LMPHostType>();
  k_map.sync<LMPDeviceType>();

-  auto h_cutsq=k_cutsq.template view<LMPHostType>();
-  for (int itype=1; itype <= atom->ntypes; ++itype)
-    for (int jtype=1; jtype <= atom->ntypes; ++jtype)
-      h_cutsq(itype,jtype) = descriptor->cutsq[map[itype]][map[jtype]];
-  k_cutsq.modify<LMPHostType>();
-  k_cutsq.sync<DeviceType>();
-
  // clear setflag since coeff() called once with I,J = * *

  int n = atom->ntypes;
@ -257,6 +270,13 @@ void PairMLIAPKokkos<DeviceType>::coeff(int narg, char **arg) {
  // set up model, descriptor, and mliap data structures
  model->init();
  descriptor->init();
+
+  auto h_cutsq=k_cutsq.template view<LMPHostType>();
+  for (int itype=1; itype <= atom->ntypes; ++itype)
+    for (int jtype=1; jtype <= atom->ntypes; ++jtype)
+      h_cutsq(itype,jtype) = descriptor->cutsq[map[itype]][map[jtype]];
+  k_cutsq.modify<LMPHostType>();
+  k_cutsq.sync<DeviceType>();
  int gradgradflag = -1;
  delete data;
  data = new MLIAPDataKokkos<DeviceType>(lmp, gradgradflag, map, model, descriptor, this);
--- a/src/ML-IAP/pair_mliap.cpp
+++ b/src/ML-IAP/pair_mliap.cpp
@ -83,7 +83,6 @@ void PairMLIAP::compute(int eflag, int vflag)
 {

  // consistency checks
-
  if (data->ndescriptors != model->ndescriptors)
    error->all(FLERR, "Inconsistent model and descriptor descriptor count: {} vs {}",
               model->ndescriptors, data->ndescriptors);
@ -134,10 +133,10 @@ void PairMLIAP::allocate()

 void PairMLIAP::settings(int narg, char ** arg)
 {
-  if (narg < 2) utils::missing_cmd_args(FLERR, "pair_style mliap", error);

  // This is needed because the unit test calls settings twice
  if (!is_child) {
+    if (narg < 2) utils::missing_cmd_args(FLERR, "pair_style mliap", error);
    delete model;
    model = nullptr;
    delete descriptor;