USER-DPD: first attempt at fix_shardlow_kokkos... It compiles!

2017-02-24 13:35:48 -05:00
parent a341a6bca9
commit f7a48719ad
2 changed files with 872 additions and 0 deletions
--- a/src/KOKKOS/fix_shardlow_kokkos.cpp
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@ -0,0 +1,718 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors:
+   James Larentzos (U.S. Army Research Laboratory)
+   and Timothy I. Mattox (Engility Corporation)
+
+   Martin Lisal (Institute of Chemical Process Fundamentals
+   of the Czech Academy of Sciences and J. E. Purkinje University)
+
+   John Brennan, Joshua Moore and William Mattson (Army Research Lab)
+
+   Please cite the related publications:
+   J. P. Larentzos, J. K. Brennan, J. D. Moore, M. Lisal, W. D. Mattson,
+   "Parallel implementation of isothermal and isoenergetic Dissipative
+   Particle Dynamics using Shardlow-like splitting algorithms",
+   Computer Physics Communications, 2014, 185, pp 1987--1998.
+
+   M. Lisal, J. K. Brennan, J. Bonet Avalos, "Dissipative particle dynamics
+   at isothermal, isobaric, isoenergetic, and isoenthalpic conditions using
+   Shardlow-like splitting algorithms", Journal of Chemical Physics, 2011,
+   135, 204105.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "fix_shardlow_kokkos.h"
+#include "atom.h"
+#include "atom_masks.h"
+#include "atom_kokkos.h"
+#include "force.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include <math.h>
+#include "atom_vec.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list_kokkos.h"
+#include "neigh_request.h"
+#include "random_mars.h"
+#include "memory.h"
+#include "domain.h"
+#include "modify.h"
+// #include "pair_dpd_fdt.h"
+#include "pair_dpd_fdt_energy_kokkos.h"
+#include "pair.h"
+#include "npair_ssa_kokkos.h"
+#include "citeme.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+#define EPSILON 1.0e-10
+#define EPSILON_SQUARED ((EPSILON) * (EPSILON))
+
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixShardlow(lmp, narg, arg), k_pairDPDE(NULL), ghostmax(0), nlocal(0) , nghost(0)
+{
+  kokkosable = 1;
+//  atomKK = (AtomKokkos *) atom;
+//  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+
+//  datamask_read = X_MASK | V_MASK | F_MASK | MASK_MASK | Q_MASK | TYPE_MASK;
+//  datamask_modify = Q_MASK | X_MASK;
+
+  if (narg != 3) error->all(FLERR,"Illegal fix shardlow command");
+
+//  k_pairDPD = NULL;
+  k_pairDPDE = NULL;
+//  k_pairDPD = (PairDPDfdtKokkos *) force->pair_match("dpd/fdt",1);
+  k_pairDPDE = (PairDPDfdtEnergyKokkos<DeviceType> *) force->pair_match("dpd/fdt/energy/kk",1);
+
+//   if(k_pairDPDE){
+    comm_forward = 3;
+    comm_reverse = 5;
+    p_rand_pool = &(k_pairDPDE->rand_pool);
+//   } else {
+//     comm_forward = 3;
+//     comm_reverse = 3;
+//     p_rand_pool = &(k_pairDPD->rand_pool);
+//   }
+
+
+  if(/* k_pairDPD == NULL &&*/ k_pairDPDE == NULL)
+    error->all(FLERR,"Must use pair_style "/*"dpd/fdt/kk or "*/"dpd/fdt/energy/kk with fix shardlow/kk");
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixShardlowKokkos<DeviceType>::~FixShardlowKokkos()
+{
+  ghostmax = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixShardlowKokkos<DeviceType>::setmask()
+{
+  int mask = 0;
+  mask |= INITIAL_INTEGRATE | PRE_NEIGHBOR;
+  return mask;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::init()
+{
+  FixShardlow::init();
+
+  int irequest = neighbor->nrequest - 1;
+
+  neighbor->requests[irequest]->
+    kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
+    !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+
+//  neighbor->requests[irequest]->pair = 0;
+//  neighbor->requests[irequest]->fix  = 1;
+//  neighbor->requests[irequest]->ghost= 1;
+//  neighbor->requests[irequest]->ssa  = 1;
+
+  int ntypes = atom->ntypes;
+  k_params = Kokkos::DualView<params_ssa**,Kokkos::LayoutRight,DeviceType>
+    ("FixShardlowKokkos::params",ntypes+1,ntypes+1);
+  params = k_params.template view<DeviceType>();
+//FIXME either create cutsq and fill it in, or just point to pairDPD's...
+//  memory->destroy(cutsq); //FIXME
+//  memory->create_kokkos(k_cutsq,cutsq,ntypes+1,ntypes+1,"FixShardlowKokkos:cutsq");
+  d_cutsq = k_pairDPDE->k_cutsq.template view<DeviceType>(); //FIXME
+
+  const double boltz2 = 2.0*force->boltz;
+  for (int i = 1; i <= ntypes; i++) {
+    for (int j = i; j <= ntypes; j++) {
+      F_FLOAT cutone = k_pairDPDE->cut[i][j];
+//      k_cutsq.h_view(i,j) = k_cutsq.h_view(j,i) = cutone*cutone; //FIXME
+      if (cutone > EPSILON) k_params.h_view(i,j).cutinv = 1.0/cutone;
+      else k_params.h_view(i,j).cutinv = FLT_MAX;
+      k_params.h_view(i,j).halfsigma = 0.5*k_pairDPDE->sigma[i][j];
+      k_params.h_view(i,j).kappa = k_pairDPDE->kappa[i][j];
+      k_params.h_view(i,j).alpha = sqrt(boltz2*k_pairDPDE->kappa[i][j]);
+
+      k_params.h_view(j,i) = k_params.h_view(i,j);
+
+      if(i<MAX_TYPES_STACKPARAMS+1 && j<MAX_TYPES_STACKPARAMS+1) {
+        m_params[i][j] = m_params[j][i] = k_params.h_view(i,j);
+        m_cutsq[j][i] = m_cutsq[i][j] = k_pairDPDE->k_cutsq.h_view(i,j);
+      }
+    }
+  }
+
+  // k_cutsq.template modify<LMPHostType>();
+  k_params.template modify<LMPHostType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::init_list(int id, NeighList *ptr)
+{
+  k_list = static_cast<NeighListKokkos<DeviceType>*>(ptr);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::pre_neighbor()
+{
+  // NOTE: this logic is specific to orthogonal boxes, not triclinic
+
+  // Enforce the constraint that ghosts must be contained in the nearest sub-domains
+  double bbx = domain->subhi[0] - domain->sublo[0];
+  double bby = domain->subhi[1] - domain->sublo[1];
+  double bbz = domain->subhi[2] - domain->sublo[2];
+
+  double rcut = 2.0*neighbor->cutneighmax;
+
+  if (domain->triclinic)
+    error->all(FLERR,"Fix shardlow does not yet support triclinic geometries");
+
+  if(rcut >= bbx || rcut >= bby || rcut>= bbz )
+  {
+    char fmt[] = {"Shardlow algorithm requires sub-domain length > 2*(rcut+skin). Either reduce the number of processors requested, or change the cutoff/skin: rcut= %e bbx= %e bby= %e bbz= %e\n"};
+    char *msg = (char *) malloc(sizeof(fmt) + 4*15);
+    sprintf(msg, fmt, rcut, bbx, bby, bbz);
+    error->one(FLERR, msg);
+  }
+
+  nlocal = atomKK->nlocal;
+  nghost = atomKK->nghost;
+
+  // Allocate memory for h_v_t0 to hold the initial velocities for the ghosts
+  if (nghost > ghostmax) {
+    ghostmax = nghost;
+    k_v_t0 = DAT::tdual_v_array("FixShardlowKokkos:v_t0", ghostmax);
+    // d_v_t0 = k_v_t0.template view<DeviceType>();
+    h_v_t0 = k_v_t0.h_view;
+  }
+
+  // Setup views of relevant data
+  x = atomKK->k_x.template view<DeviceType>();
+  v = atomKK->k_v.template view<DeviceType>();
+  h_v = atomKK->k_v.h_view;
+  uCond = atomKK->k_uCond.template view<DeviceType>();
+  h_uCond = atomKK->k_uCond.h_view;
+  uMech = atomKK->k_uMech.template view<DeviceType>();
+  h_uMech = atomKK->k_uMech.h_view;
+  type = atomKK->k_type.view<DeviceType>();
+  if (atomKK->rmass) {
+    massPerI = true;
+    masses = atomKK->k_rmass.view<DeviceType>();
+  } else {
+    massPerI = false;
+    masses = atomKK->k_mass.view<DeviceType>();
+  }
+//   if(k_pairDPDE){
+  dpdTheta = atomKK->k_dpdTheta.view<DeviceType>();
+
+//} else {
+//}
+}
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::setup_pre_neighbor()
+{
+  pre_neighbor();
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef NOTNOW
+/* ----------------------------------------------------------------------
+   Perform the stochastic integration and Shardlow update for constant temperature
+   Allow for both per-type and per-atom mass
+
+   NOTE: only implemented for orthogonal boxes, not triclinic
+------------------------------------------------------------------------- */
+template<class DeviceType>
+template<bool STACKPARAMS>
+void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
+  int start_ii, int count
+)
+{
+  rand_type rand_gen = p_rand_pool->get_state();
+
+  const double theta_ij_inv = 1.0/k_pairDPD->temperature; // independent of i,j
+  const double boltz_inv = 1.0/force->boltz;
+  const double ftm2v = force->ftm2v;
+  const double dt     = update->dt;
+  int ct = count;
+  int ii = start_ii;
+
+  while (ct-- > 0) {
+    const int i = d_ilist(ii);
+    const int jlen = d_numneigh(ii);
+
+    const double xtmp = x(i, 0);
+    const double ytmp = x(i, 1);
+    const double ztmp = x(i, 2);
+
+    // load velocity for i from memory
+    double vxi = v(i, 0);
+    double vyi = v(i, 1);
+    double vzi = v(i, 2);
+
+    const int itype = type(i);
+
+    const double mass_i = masses(massPerI ? i : itype);
+    const double massinv_i = 1.0 / mass_i;
+
+    // Loop over Directional Neighbors only
+    for (int jj = 0; jj < jlen; jj++) {
+      const int j = d_neighbors(ii,jj) & NEIGHMASK;
+      int jtype = type[j];
+
+      const X_FLOAT delx = xtmp - x(j, 0);
+      const X_FLOAT dely = ytmp - x(j, 1);
+      const X_FLOAT delz = ztmp - x(j, 2);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+      // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
+      if ((rsq < STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype))
+        && (rsq >= EPSILON_SQUARED)) {
+        double r = sqrt(rsq);
+        double rinv = 1.0/r;
+        double delx_rinv = delx*rinv;
+        double dely_rinv = dely*rinv;
+        double delz_rinv = delz*rinv;
+
+        double wr = 1.0 - r*(STACKPARAMS?m_params[itype][jtype].cutinv:params(itype,jtype).cutinv);
+        double wdt = wr*wr*dt;
+
+        double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma;
+        double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
+
+        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * pRNG->gaussian();
+
+        const double mass_j = masses(massPerI ? j : jtype);
+        double massinv_j = 1.0 / mass_j;
+
+        double gammaFactor = halfgamma_ij*wdt*ftm2v;
+        double inv_1p_mu_gammaFactor = 1.0/(1.0 + (massinv_i + massinv_j)*gammaFactor);
+
+        double vxj = v(j, 0);
+        double vyj = v(j, 1);
+        double vzj = v(j, 2);
+
+        // Compute the initial velocity difference between atom i and atom j
+        double delvx = vxi - vxj;
+        double delvy = vyi - vyj;
+        double delvz = vzi - vzj;
+        double dot_rinv = (delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz);
+
+        // Compute momentum change between t and t+dt
+        double factorA = sigmaRand - gammaFactor*dot_rinv;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorA*massinv_i;
+        vyi += dely_rinv*factorA*massinv_i;
+        vzi += delz_rinv*factorA*massinv_i;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorA*massinv_j;
+        vyj -= dely_rinv*factorA*massinv_j;
+        vzj -= delz_rinv*factorA*massinv_j;
+
+        //ii.   Compute the new velocity diff
+        delvx = vxi - vxj;
+        delvy = vyi - vyj;
+        delvz = vzi - vzj;
+        dot_rinv = delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz;
+
+        // Compute the new momentum change between t and t+dt
+        double factorB = (sigmaRand - gammaFactor*dot_rinv)*inv_1p_mu_gammaFactor;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorB*massinv_i;
+        vyi += dely_rinv*factorB*massinv_i;
+        vzi += delz_rinv*factorB*massinv_i;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorB*massinv_j;
+        vyj -= dely_rinv*factorB*massinv_j;
+        vzj -= delz_rinv*factorB*massinv_j;
+
+        // Store updated velocity for j
+        v(j, 0) = vxj;
+        v(j, 1) = vyj;
+        v(j, 2) = vzj;
+      }
+    }
+    // store updated velocity for i
+    v(i, 0) = vxi;
+    v(i, 1) = vyi;
+    v(i, 2) = vzi;
+  }
+
+  p_rand_pool->free_state(rand_gen);
+}
+#endif
+
+/* ----------------------------------------------------------------------
+   Perform the stochastic integration and Shardlow update for constant energy
+   Allow for both per-type and per-atom mass
+
+   NOTE: only implemented for orthogonal boxes, not triclinic
+------------------------------------------------------------------------- */
+template<class DeviceType>
+template<bool STACKPARAMS>
+void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
+  int start_ii, int count
+)
+{
+  rand_type rand_gen = p_rand_pool->get_state();
+
+  const double boltz_inv = 1.0/force->boltz;
+  const double ftm2v = force->ftm2v;
+  const double dt     = update->dt;
+  int ct = count;
+  int ii = start_ii;
+
+  while (ct-- > 0) {
+    const int i = d_ilist(ii);
+    const int jlen = d_numneigh(ii);
+
+    const double xtmp = x(i, 0);
+    const double ytmp = x(i, 1);
+    const double ztmp = x(i, 2);
+
+    // load velocity for i from memory
+    double vxi = v(i, 0);
+    double vyi = v(i, 1);
+    double vzi = v(i, 2);
+
+    double uMech_i = uMech(i);
+    double uCond_i = uCond(i);
+    const int itype = type(i);
+
+    const double theta_i_inv = 1.0/dpdTheta(i);
+    const double mass_i = masses(massPerI ? i : itype);
+    const double massinv_i = 1.0 / mass_i;
+    const double mass_i_div_neg4_ftm2v = mass_i*(-0.25)/ftm2v;
+
+    // Loop over Directional Neighbors only
+    for (int jj = 0; jj < jlen; jj++) {
+      const int j = d_neighbors(ii,jj) & NEIGHMASK;
+      const int jtype = type(j);
+
+      const X_FLOAT delx = xtmp - x(j, 0);
+      const X_FLOAT dely = ytmp - x(j, 1);
+      const X_FLOAT delz = ztmp - x(j, 2);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+
+      // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
+      if ((rsq < STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype))
+        && (rsq >= EPSILON_SQUARED)) {
+        double r = sqrt(rsq);
+        double rinv = 1.0/r;
+        double delx_rinv = delx*rinv;
+        double dely_rinv = dely*rinv;
+        double delz_rinv = delz*rinv;
+
+        double wr = 1.0 - r*(STACKPARAMS?m_params[itype][jtype].cutinv:params(itype,jtype).cutinv);
+        double wdt = wr*wr*dt;
+
+        // Compute the current temperature
+        double theta_j_inv = 1.0/dpdTheta(j);
+        double theta_ij_inv = 0.5*(theta_i_inv + theta_j_inv);
+
+        double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma;
+        double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
+
+        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * rand_gen.normal();
+
+        const double mass_j = masses(massPerI ? j : jtype);
+        double mass_ij_div_neg4_ftm2v = mass_j*mass_i_div_neg4_ftm2v;
+        double massinv_j = 1.0 / mass_j;
+
+        // Compute uCond
+        double kappa_ij = STACKPARAMS?m_params[itype][jtype].kappa:params(itype,jtype).kappa;
+        double alpha_ij = STACKPARAMS?m_params[itype][jtype].alpha:params(itype,jtype).alpha;
+        double del_uCond = alpha_ij*wr*dtsqrt * rand_gen.normal();
+
+        del_uCond += kappa_ij*(theta_i_inv - theta_j_inv)*wdt;
+        uCond[j] -= del_uCond;
+        uCond_i += del_uCond;
+
+        double gammaFactor = halfgamma_ij*wdt*ftm2v;
+        double inv_1p_mu_gammaFactor = 1.0/(1.0 + (massinv_i + massinv_j)*gammaFactor);
+
+        double vxj = v(j, 0);
+        double vyj = v(j, 1);
+        double vzj = v(j, 2);
+        double dot4 = vxj*vxj + vyj*vyj + vzj*vzj;
+        double dot3 = vxi*vxi + vyi*vyi + vzi*vzi;
+
+        // Compute the initial velocity difference between atom i and atom j
+        double delvx = vxi - vxj;
+        double delvy = vyi - vyj;
+        double delvz = vzi - vzj;
+        double dot_rinv = (delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz);
+
+        // Compute momentum change between t and t+dt
+        double factorA = sigmaRand - gammaFactor*dot_rinv;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorA*massinv_i;
+        vyi += dely_rinv*factorA*massinv_i;
+        vzi += delz_rinv*factorA*massinv_i;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorA*massinv_j;
+        vyj -= dely_rinv*factorA*massinv_j;
+        vzj -= delz_rinv*factorA*massinv_j;
+
+        //ii.   Compute the new velocity diff
+        delvx = vxi - vxj;
+        delvy = vyi - vyj;
+        delvz = vzi - vzj;
+        dot_rinv = delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz;
+
+        // Compute the new momentum change between t and t+dt
+        double factorB = (sigmaRand - gammaFactor*dot_rinv)*inv_1p_mu_gammaFactor;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorB*massinv_i;
+        vyi += dely_rinv*factorB*massinv_i;
+        vzi += delz_rinv*factorB*massinv_i;
+        double partial_uMech = (vxi*vxi + vyi*vyi + vzi*vzi - dot3)*massinv_j;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorB*massinv_j;
+        vyj -= dely_rinv*factorB*massinv_j;
+        vzj -= delz_rinv*factorB*massinv_j;
+        partial_uMech += (vxj*vxj + vyj*vyj + vzj*vzj - dot4)*massinv_i;
+
+        // Store updated velocity for j
+        v(j, 0) = vxj;
+        v(j, 1) = vyj;
+        v(j, 2) = vzj;
+
+        // Compute uMech
+        double del_uMech = partial_uMech*mass_ij_div_neg4_ftm2v;
+        uMech_i += del_uMech;
+        uMech(j) += del_uMech;
+      }
+    }
+    // store updated velocity for i
+    v(i, 0) = vxi;
+    v(i, 1) = vyi;
+    v(i, 2) = vzi;
+    // store updated uMech and uCond for i
+    uMech(i) = uMech_i;
+    uCond(i) = uCond_i;
+    ii++;
+  }
+
+  p_rand_pool->free_state(rand_gen);
+}
+
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
+{
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  k_list->clean_copy();
+  //cleanup_copy();
+  copymode = 1;
+
+  dtsqrt = sqrt(update->dt);
+
+  NPairSSAKokkos<DeviceType> *np_ssa = dynamic_cast<NPairSSAKokkos<DeviceType>*>(list->np);
+  if (!np_ssa) error->one(FLERR, "NPair wasn't a NPairSSAKokkos object");
+  ssa_phaseCt = np_ssa->ssa_phaseCt;
+  ssa_phaseLen = np_ssa->ssa_phaseLen;
+  ssa_itemLoc = np_ssa->ssa_itemLoc;
+  ssa_itemLen = np_ssa->ssa_itemLen;
+  ssa_gphaseCt = np_ssa->ssa_gphaseCt;
+  ssa_gphaseLen = np_ssa->ssa_gphaseLen;
+  ssa_gitemLoc = np_ssa->ssa_gitemLoc;
+  ssa_gitemLen = np_ssa->ssa_gitemLen;
+
+  // process neighbors in the local AIR
+  for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
+    int workItemCt = ssa_phaseLen[workPhase];
+
+    if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
+      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+        int ct = ssa_itemLen(workPhase, workItem);
+        int ii = ssa_itemLoc(workPhase, workItem);
+        ssa_update_dpde<false>(ii, ct);
+      });
+    } else {
+      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+        int ct = ssa_itemLen(workPhase, workItem);
+        int ii = ssa_itemLoc(workPhase, workItem);
+        ssa_update_dpde<true>(ii, ct);
+      });
+    }
+  }
+
+  //Loop over all 13 outward directions (7 stages)
+  for (int workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) {
+    // int airnum = workPhase + 1;
+    int workItemCt = ssa_gphaseLen[workPhase];
+
+    // Communicate the updated velocities to all nodes
+    comm->forward_comm_fix(this);
+
+    if(k_pairDPDE){
+      // Zero out the ghosts' uCond & uMech to be used as delta accumulators
+//      memset(&(atom->uCond[nlocal]), 0, sizeof(double)*nghost);
+//      memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost);
+
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(nlocal,nlocal+nghost), KOKKOS_LAMBDA (const int i) {
+        uCond(i) = 0.0;
+        uMech(i) = 0.0;
+      });
+      DeviceType::fence();
+    }
+
+    // process neighbors in this AIR
+    if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
+      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+        int ct = ssa_gitemLen(workPhase, workItem);
+        int ii = ssa_gitemLoc(workPhase, workItem);
+        ssa_update_dpde<false>(ii, ct);
+      });
+    } else {
+      Kokkos::parallel_for(workItemCt, KOKKOS_LAMBDA (const int workItem ) {
+        int ct = ssa_gitemLen(workPhase, workItem);
+        int ii = ssa_gitemLoc(workPhase, workItem);
+        ssa_update_dpde<true>(ii, ct);
+      });
+    }
+
+    // Communicate the ghost deltas to the atom owners
+    comm->reverse_comm_fix(this);
+
+  }  //End Loop over all directions For airnum = Top, Top-Right, Right, Bottom-Right, Back
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixShardlowKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
+{
+  int ii,jj,m;
+
+  m = 0;
+  for (ii = 0; ii < n; ii++) {
+    jj = list[ii];
+    buf[m++] = h_v(jj, 0);
+    buf[m++] = h_v(jj, 1);
+    buf[m++] = h_v(jj, 2);
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
+{
+  int ii,m,last;
+
+  m = 0;
+  last = first + n ;
+  for (ii = first; ii < last; ii++) {
+    h_v_t0(ii - nlocal, 0) = h_v(ii, 0) = buf[m++];
+    h_v_t0(ii - nlocal, 1) = h_v(ii, 1) = buf[m++];
+    h_v_t0(ii - nlocal, 2) = h_v(ii, 2) = buf[m++];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixShardlowKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
+{
+  int i,m,last;
+
+  m = 0;
+  last = first + n;
+  for (i = first; i < last; i++) {
+    buf[m++] = h_v(i, 0) - h_v_t0(i - nlocal, 0);
+    buf[m++] = h_v(i, 1) - h_v_t0(i - nlocal, 1);
+    buf[m++] = h_v(i, 2) - h_v_t0(i - nlocal, 2);
+    if(k_pairDPDE){
+      buf[m++] = h_uCond(i); // for ghosts, this is an accumulated delta
+      buf[m++] = h_uMech(i); // for ghosts, this is an accumulated delta
+    }
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
+{
+  int i,j,m;
+
+  m = 0;
+  for (i = 0; i < n; i++) {
+    j = list[i];
+
+    h_v(j, 0) += buf[m++];
+    h_v(j, 1) += buf[m++];
+    h_v(j, 2) += buf[m++];
+    if(k_pairDPDE){
+      h_uCond(j) += buf[m++]; // add in the accumulated delta
+      h_uMech(j) += buf[m++]; // add in the accumulated delta
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+double FixShardlowKokkos<DeviceType>::memory_usage()
+{
+  double bytes = 0.0;
+  bytes += sizeof(double)*3*ghostmax; // v_t0[]
+  return bytes;
+}
+
+namespace LAMMPS_NS {
+template class FixShardlowKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class FixShardlowKokkos<LMPHostType>;
+#endif
+}
--- a/src/KOKKOS/fix_shardlow_kokkos.h
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@ -0,0 +1,154 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(shardlow/kk,FixShardlowKokkos<LMPDeviceType>)
+FixStyle(shardlow/kk/device,FixShardlowKokkos<LMPDeviceType>)
+FixStyle(shardlow/kk/host,FixShardlowKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_SHARDLOW_KOKKOS_H
+#define LMP_FIX_SHARDLOW_KOKKOS_H
+
+#include "float.h"
+#include "fix_shardlow.h"
+#include "kokkos_type.h"
+#include "neigh_list_kokkos.h"
+#include "pair_dpd_fdt_energy_kokkos.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class FixShardlowKokkos : public FixShardlow {
+ public:
+  typedef ArrayTypes<DeviceType> AT;
+  NeighListKokkos<DeviceType> *k_list; // The SSA specific neighbor list
+
+  FixShardlowKokkos(class LAMMPS *, int, char **);
+  ~FixShardlowKokkos();
+  int setmask();
+  virtual void init();
+  virtual void init_list(int, class NeighList *);
+  virtual void initial_integrate(int);
+  void setup_pre_neighbor();
+  void pre_neighbor();
+
+  double memory_usage();
+
+  int pack_reverse_comm(int, int, double *);
+  void unpack_reverse_comm(int, int *, double *);
+  int pack_forward_comm(int , int *, double *, int, int *);
+  void unpack_forward_comm(int , int , double *);
+
+  struct params_ssa {
+    KOKKOS_INLINE_FUNCTION
+    params_ssa(){cutinv=FLT_MAX;halfsigma=0;kappa=0;alpha=0;};
+    KOKKOS_INLINE_FUNCTION
+    params_ssa(int i){cutinv=FLT_MAX;halfsigma=0;kappa=0;alpha=0;};
+    F_FLOAT cutinv,halfsigma,kappa,alpha;
+  };
+
+ protected:
+//  class PairDPDfdt *pairDPD;
+  PairDPDfdtEnergyKokkos<DeviceType> *k_pairDPDE;
+  Kokkos::Random_XorShift64_Pool<DeviceType> *p_rand_pool;
+  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
+
+  Kokkos::DualView<params_ssa**,Kokkos::LayoutRight,DeviceType> k_params;
+  typename Kokkos::DualView<params_ssa**,
+    Kokkos::LayoutRight,DeviceType>::t_dev_const_um params;
+  // hardwired to space for MAX_TYPES_STACKPARAMS (12) atom types
+  params_ssa m_params[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+
+  F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
+
+  typename DAT::tdual_v_array k_v_t0;
+  // typename AT::t_v_array d_v_t0; v_t0 only used in comm routines (on host)
+  typename HAT::t_v_array h_v_t0;
+
+  typename AT::t_x_array x;
+  typename AT::t_v_array v;
+  typename HAT::t_v_array h_v;
+  typename AT::t_efloat_1d uCond, uMech;
+  typename HAT::t_efloat_1d h_uCond, h_uMech;
+  typename AT::t_int_1d type;
+  bool massPerI;
+  typename AT::t_float_1d_randomread masses;
+  typename AT::t_efloat_1d dpdTheta;
+
+  double dtsqrt; // = sqrt(update->dt);
+  int ghostmax;
+  int nlocal, nghost;
+
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist, d_numneigh;
+
+  int ssa_phaseCt;
+  typename AT::t_int_1d ssa_phaseLen;
+  typename AT::t_int_2d ssa_itemLoc, ssa_itemLen;
+
+  int ssa_gphaseCt;
+  typename AT::t_int_1d ssa_gphaseLen;
+  typename AT::t_int_2d ssa_gitemLoc, ssa_gitemLen;
+
+
+//  template<bool STACKPARAMS>
+//  void ssa_update_dpd(int, int);  // Constant Temperature
+  template<bool STACKPARAMS>
+  void ssa_update_dpde(int, int); // Constant Energy
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Must use dpd/fdt pair_style with fix shardlow
+
+Self-explanatory.
+
+E: Must use pair_style dpd/fdt or dpd/fdt/energy with fix shardlow
+
+E: A deterministic integrator must be specified after fix shardlow in input
+file (e.g. fix nve or fix nph).
+
+Self-explanatory.
+
+E: Cannot use constant temperature integration routines with DPD
+
+Self-explanatory.  Must use deterministic integrators such as nve or nph
+
+E: Fix shardlow does not yet support triclinic geometries
+
+Self-explanatory.
+
+E:  Shardlow algorithm requires sub-domain length > 2*(rcut+skin). Either
+reduce the number of processors requested, or change the cutoff/skin
+
+The Shardlow splitting algorithm requires the size of the sub-domain lengths
+to be are larger than twice the cutoff+skin.  Generally, the domain decomposition
+is dependant on the number of processors requested.
+
+*/