diff --git a/src/USER-OMP/fix_nphug_omp.cpp b/src/USER-OMP/fix_nphug_omp.cpp new file mode 100644 index 0000000000..273ba4c450 --- /dev/null +++ b/src/USER-OMP/fix_nphug_omp.cpp @@ -0,0 +1,461 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "string.h" +#include "stdlib.h" +#include "fix_nphug_omp.h" +#include "modify.h" +#include "error.h" +#include "update.h" +#include "compute.h" +#include "force.h" +#include "domain.h" +#include "group.h" +#include "math.h" +#include "memory.h" +#include "comm.h" +#include "math.h" + +using namespace LAMMPS_NS; +using namespace FixConst; + +enum{ISO,ANISO,TRICLINIC}; // same as fix_nh.cpp + +/* ---------------------------------------------------------------------- */ + +FixNPHugOMP::FixNPHugOMP(LAMMPS *lmp, int narg, char **arg) : + FixNHOMP(lmp, narg, arg) +{ + + // Prevent masses from being updated every timestep + + eta_mass_flag = 0; + omega_mass_flag = 0; + etap_mass_flag = 0; + + // extend vector of base-class computes + + size_vector += 3; + + // turn off deviatoric flag and remove strain energy from vector + + deviatoric_flag = 0; + size_vector -= 1; + + // use initial state as reference state + + v0_set = p0_set = e0_set = 0; + + // check pressure settings + + if (p_start[0] != p_stop[0] || + p_start[1] != p_stop[1] || + p_start[2] != p_stop[2]) + error->all(FLERR,"Pstart and Pstop must have the same value"); + + // uniaxial = 0 means hydrostatic compression + // uniaxial = 1 means uniaxial compression + // in x, y, or z (idir = 0, 1, or 2) + + // isotropic hydrostatic compression + + if (pstyle == ISO) { + uniaxial = 0; + + // anisotropic compression + + } else if (pstyle == ANISO) { + + // anisotropic hydrostatic compression + + if (p_start[0] == p_start[1] && + p_start[0] == p_start[2] ) + uniaxial = 0; + + // uniaxial compression + + else if (p_flag[0] == 1 && p_flag[1] == 0 + && p_flag[2] == 0) { + uniaxial = 1; + idir = 0; + } else if (p_flag[0] == 0 && p_flag[1] == 1 + && p_flag[2] == 0) { + uniaxial = 1; + idir = 1; + } else if (p_flag[0] == 0 && p_flag[1] == 0 + && p_flag[2] == 1) { + uniaxial = 1; + idir = 2; + + } else error->all(FLERR,"Specified target stress must be uniaxial or hydrostatic"); + + // triclinic hydrostatic compression + + } else if (pstyle == TRICLINIC) { + + if (p_start[0] == p_start[1] && + p_start[0] == p_start[2] && + p_start[3] == 0.0 && + p_start[4] == 0.0 && + p_start[5] == 0.0 ) + uniaxial = 0; + + else error->all(FLERR,"For triclinic deformation, specified target stress must be hydrostatic"); + } + + if (!tstat_flag) + error->all(FLERR,"Temperature control must be used with fix nphug/omp"); + if (!pstat_flag) + error->all(FLERR,"Pressure control must be used with fix nphug/omp"); + + // create a new compute temp style + // id = fix-ID + temp + // compute group = all since pressure is always global (group all) + // and thus its KE/temperature contribution should use group all + + int n = strlen(id) + 6; + id_temp = new char[n]; + 
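  // the calls below amount to the input-script command
  //     compute <fix-ID>_temp all temp
  // the same fix-ID + suffix pattern is reused further down for the
  // "_press" and "_pe" computes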
strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[3]; + newarg[0] = id_temp; + newarg[1] = (char *) "all"; + newarg[2] = (char *) "temp"; + + modify->add_compute(3,newarg); + delete [] newarg; + tflag = 1; + + // create a new compute pressure style + // id = fix-ID + press, compute group = all + // pass id_temp as 4th arg to pressure constructor + + n = strlen(id) + 7; + id_press = new char[n]; + strcpy(id_press,id); + strcat(id_press,"_press"); + + newarg = new char*[4]; + newarg[0] = id_press; + newarg[1] = (char *) "all"; + newarg[2] = (char *) "pressure"; + newarg[3] = id_temp; + modify->add_compute(4,newarg); + delete [] newarg; + pflag = 1; + + // create a new compute potential energy compute + + n = strlen(id) + 3; + id_pe = new char[n]; + strcpy(id_pe,id); + strcat(id_pe,"_pe"); + + newarg = new char*[3]; + newarg[0] = id_pe; + newarg[1] = (char*)"all"; + newarg[2] = (char*)"pe"; + modify->add_compute(3,newarg); + delete [] newarg; + peflag = 1; +} + +/* ---------------------------------------------------------------------- */ + +FixNPHugOMP::~FixNPHugOMP() +{ + + // temp and press computes handled by base class + // delete pe compute + + if (peflag) modify->delete_compute(id_pe); + delete [] id_pe; + +} + +/* ---------------------------------------------------------------------- */ + +void FixNPHugOMP::init() +{ + // Call base class init() + + FixNHOMP::init(); + + // set pe ptr + + int icompute = modify->find_compute(id_pe); + if (icompute < 0) + error->all(FLERR,"Potential energy ID for fix nvt/nph/npt does not exist"); + pe = modify->compute[icompute]; +} + + +/* ---------------------------------------------------------------------- + compute initial state before integrator starts +------------------------------------------------------------------------- */ + +void FixNPHugOMP::setup(int vflag) +{ + FixNHOMP::setup(vflag); + + if ( v0_set == 0 ) { + v0 = compute_vol(); + v0_set = 1; + } + + if ( p0_set == 0 ) { + p0_set = 1; + if (uniaxial == 1) + p0 = p_current[idir]; + else + p0 = (p_current[0]+p_current[1]+p_current[2])/3.0; + } + + if ( e0_set == 0 ) { + e0 = compute_etotal(); + e0_set = 1; + } + + double masstot = group->mass(igroup); + rho0 = nktv2p*force->mvv2e*masstot/v0; + + t_target = 0.01; + + pe->addstep(update->ntimestep+1); +} + +/* ---------------------------------------------------------------------- + compute target temperature and kinetic energy +-----------------------------------------------------------------------*/ + +void FixNPHugOMP::compute_temp_target() +{ + t_target = t_current + compute_hugoniot(); + ke_target = tdof * boltz * t_target; + pe->addstep(update->ntimestep+1); +} + +/* ---------------------------------------------------------------------- */ + +double FixNPHugOMP::compute_etotal() +{ + double epot,ekin,etot; + epot = pe->compute_scalar(); + if (thermo_energy) epot -= compute_scalar(); + ekin = temperature->compute_scalar(); + ekin *= 0.5 * tdof * force->boltz; + etot = epot+ekin; + return etot; +} + +/* ---------------------------------------------------------------------- */ + +double FixNPHugOMP::compute_vol() +{ + if (domain->dimension == 3) + return domain->xprd * domain->yprd * domain->zprd; + else + return domain->xprd * domain->yprd; +} + +/* ---------------------------------------------------------------------- + Computes the deviation of the current point + from the Hugoniot in temperature units. 
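   In the variables used below, the Hugoniot energy condition reads
       e - e0 = 0.5*(p + p0)*(v0 - v) / nktv2p
   (the nktv2p factor converts the P*V product to energy units), and this
   routine returns the residual of that relation divided by tdof*boltz,
   i.e. expressed as an equivalent temperature.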
+------------------------------------------------------------------------- */ + +double FixNPHugOMP::compute_hugoniot() +{ + double v,e,p; + double dhugo; + + e = compute_etotal(); + + temperature->compute_vector(); + + + if (uniaxial == 1) { + pressure->compute_vector(); + p = pressure->vector[idir]; + } else + p = pressure->compute_scalar(); + + v = compute_vol(); + + dhugo = (0.5 * (p + p0 ) * ( v0 - v)) / + force->nktv2p + e0 - e; + + dhugo /= tdof * boltz; + + return dhugo; +} + +/* ---------------------------------------------------------------------- + Compute shock velocity is distance/time units +------------------------------------------------------------------------- */ + +double FixNPHugOMP::compute_us() +{ + double v,p; + double eps,us; + + temperature->compute_vector(); + + if (uniaxial == 1) { + pressure->compute_vector(); + p = pressure->vector[idir]; + } else + p = pressure->compute_scalar(); + + v = compute_vol(); + + // Us^2 = (p-p0)/(rho0*eps) + + eps = 1.0 - v/v0; + if (eps < 1.0e-10) us = 0.0; + else if (p < p0) us = 0.0; + else us = sqrt((p-p0)/(rho0*eps)); + + return us; +} + +/* ---------------------------------------------------------------------- + Compute particle velocity is distance/time units +------------------------------------------------------------------------- */ + +double FixNPHugOMP::compute_up() +{ + double v; + double eps,us,up; + + v = compute_vol(); + us = compute_us(); + + // u = eps*Us + + eps = 1.0 - v/v0; + up = us*eps; + + return up; +} + +// look for index in local class +// if index not found, look in base class + +double FixNPHugOMP::compute_vector(int n) +{ + int ilen; + + // n = 0: Hugoniot energy difference (temperature units) + + ilen = 1; + if (n < ilen) return compute_hugoniot(); + n -= ilen; + + // n = 1: Shock velocity + + ilen = 1; + if (n < ilen) return compute_us(); + n -= ilen; + + // n = 2: Particle velocity + + ilen = 1; + if (n < ilen) return compute_up(); + n -= ilen; + + // index not found, look in base class + + return FixNHOMP::compute_vector(n); +} + +/* ---------------------------------------------------------------------- + pack restart data +------------------------------------------------------------------------- */ + +int FixNPHugOMP::pack_restart_data(double *list) +{ + int n = 0; + + list[n++] = e0; + list[n++] = v0; + list[n++] = p0; + + // call the base class function + + n += FixNHOMP::pack_restart_data(list+n); + + return n; +} + +/* ---------------------------------------------------------------------- + calculate the number of data to be packed +------------------------------------------------------------------------- */ + +int FixNPHugOMP::size_restart_global() +{ + int nsize = 3; + + // call the base class function + + nsize += FixNHOMP::size_restart_global(); + + return nsize; +} + +/* ---------------------------------------------------------------------- + use state info from restart file to restart the Fix +------------------------------------------------------------------------- */ + +void FixNPHugOMP::restart(char *buf) +{ + int n = 0; + double *list = (double *) buf; + e0 = list[n++]; + v0 = list[n++]; + p0 = list[n++]; + + e0_set = 1; + v0_set = 1; + p0_set = 1; + + // call the base class function + + buf += n*sizeof(double); + FixNHOMP::restart(buf); + +} + +/* ---------------------------------------------------------------------- */ + +int FixNPHugOMP::modify_param(int narg, char **arg) +{ + if (strcmp(arg[0],"e0") == 0) { + if (narg < 2) error->all(FLERR,"Illegal fix nphug/omp command"); + e0 = 
atof(arg[1]); + e0_set = 1; + return 2; + } else if (strcmp(arg[0],"v0") == 0) { + if (narg < 2) error->all(FLERR,"Illegal fix nphug/omp command"); + v0 = atof(arg[1]); + v0_set = 1; + return 2; + } else if (strcmp(arg[0],"p0") == 0) { + if (narg < 2) error->all(FLERR,"Illegal fix nphug/omp command"); + p0 = atof(arg[1]); + p0_set = 1; + return 2; + } + + return 0; +} diff --git a/src/USER-OMP/fix_nphug_omp.h b/src/USER-OMP/fix_nphug_omp.h new file mode 100644 index 0000000000..4a4db63440 --- /dev/null +++ b/src/USER-OMP/fix_nphug_omp.h @@ -0,0 +1,98 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nphug/omp,FixNPHugOMP) + +#else + +#ifndef LMP_FIX_NPHUG_OMP_H +#define LMP_FIX_NPHUG_OMP_H + +#include "fix_nh_omp.h" + +namespace LAMMPS_NS { + +class FixNPHugOMP : public FixNHOMP { + public: + FixNPHugOMP(class LAMMPS *, int, char **); + virtual ~FixNPHugOMP(); + + virtual void init(); + virtual void setup(int); + virtual int modify_param(int, char **); + virtual int pack_restart_data(double *); // pack restart data + virtual void restart(char *); + + private: + class Compute *pe; // PE compute pointer + + void compute_temp_target(); + double compute_vector(int); + double compute_etotal(); + double compute_vol(); + double compute_hugoniot(); + double compute_us(); + double compute_up(); + + char *id_pe; + int peflag; + int v0_set,p0_set,e0_set; + double v0,p0,e0,rho0; + int idir; + int uniaxial; + + int size_restart_global(); +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Pstart and Pstop must have the same value + +Self-explanatory. + +E: Specified target stress must be uniaxial or hydrostatic + +Self-explanatory. + +E: For triclinic deformation, specified target stress must be hydrostatic + +Triclinic pressure control is allowed using the tri keyword, but +non-hydrostatic pressure control can not be used in this case. + +E: Temperature control must be used with fix nphug + +The temp keyword must be provided. + +E: Pressure control must be used with fix nphug + +A pressure control keyword (iso, aniso, tri, x, y, or z) must be +provided. + +E: Potential energy ID for fix nvt/nph/npt does not exist + +A compute for potential energy must be defined. + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +*/ diff --git a/src/USER-OMP/fix_rigid_small_omp.cpp b/src/USER-OMP/fix_rigid_small_omp.cpp new file mode 100644 index 0000000000..b5d20655e8 --- /dev/null +++ b/src/USER-OMP/fix_rigid_small_omp.cpp @@ -0,0 +1,620 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. 
Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Axel Kohlmeyer (Temple U) +------------------------------------------------------------------------- */ + +#include "fix_rigid_small_omp.h" + +#include "atom.h" +#include "atom_vec_ellipsoid.h" +#include "atom_vec_line.h" +#include "atom_vec_tri.h" +#include "comm.h" +#include "domain.h" + +#include + +#if defined(_OPENMP) +#include +#endif + +#include "math_extra.h" +#include "math_const.h" + +using namespace LAMMPS_NS; +using namespace FixConst; +using namespace MathConst; + +#define EINERTIA 0.4 // moment of inertia prefactor for ellipsoid + +enum{FULL_BODY,INITIAL,FINAL,FORCE_TORQUE,VCM_ANGMOM,XCM_MASS,ITENSOR,DOF}; + +typedef struct { double x,y,z; } dbl3_t; +#if defined(__GNUC__) +#define _noalias __restrict +#else +#define _noalias +#endif + +/* ---------------------------------------------------------------------- */ + +void FixRigidSmallOMP::initial_integrate(int vflag) +{ + int ibody; + +#if defined(_OPENMP) +#pragma omp parallel for default(none) private(ibody) schedule(static) +#endif + for (ibody = 0; ibody < nlocal_body; ibody++) { + + Body &b = body[ibody]; + + // update vcm by 1/2 step + + const double dtfm = dtf / b.mass; + b.vcm[0] += dtfm * b.fcm[0]; + b.vcm[1] += dtfm * b.fcm[1]; + b.vcm[2] += dtfm * b.fcm[2]; + + // update xcm by full step + + b.xcm[0] += dtv * b.vcm[0]; + b.xcm[1] += dtv * b.vcm[1]; + b.xcm[2] += dtv * b.vcm[2]; + + // update angular momentum by 1/2 step + + b.angmom[0] += dtf * b.torque[0]; + b.angmom[1] += dtf * b.torque[1]; + b.angmom[2] += dtf * b.torque[2]; + + // compute omega at 1/2 step from angmom at 1/2 step and current q + // update quaternion a full step via Richardson iteration + // returns new normalized quaternion, also updated omega at 1/2 step + // update ex,ey,ez to reflect new quaternion + + MathExtra::angmom_to_omega(b.angmom,b.ex_space,b.ey_space, + b.ez_space,b.inertia,b.omega); + MathExtra::richardson(b.quat,b.angmom,b.omega,b.inertia,dtq); + MathExtra::q_to_exyz(b.quat,b.ex_space,b.ey_space,b.ez_space); + } // end of omp parallel for + + // virial setup before call to set_xv + + if (vflag) v_setup(vflag); + else evflag = 0; + + // forward communicate updated info of all bodies + + commflag = INITIAL; + comm->forward_comm_variable_fix(this); + + // set coords/orient and velocity/rotation of atoms in rigid bodies + + if (triclinic) + if (evflag) + set_xv_thr<1,1>(); + else + set_xv_thr<1,0>(); + else + if (evflag) + set_xv_thr<0,1>(); + else + set_xv_thr<0,0>(); +} + +/* ---------------------------------------------------------------------- */ + +void FixRigidSmallOMP::final_integrate() +{ + double * const * _noalias const x = atom->x; + const dbl3_t * _noalias const f = (dbl3_t *) atom->f[0]; + const double * const * const torque_one = atom->torque; + const tagint * _noalias const image = atom->image; + const int nlocal = atom->nlocal; + const int nthreads=comm->nthreads; + int i, ibody; + +#if defined(_OPENMP) +#pragma omp parallel for default(none) private(ibody) schedule(static) +#endif + for (ibody = 0; ibody < nlocal_body+nghost_body; ibody++) { + double * _noalias const fcm = 
body[ibody].fcm; + fcm[0] = fcm[1] = fcm[2] = 0.0; + double * _noalias const tcm = body[ibody].torque; + tcm[0] = tcm[1] = tcm[2] = 0.0; + } + + // sum over atoms to get force and torque on rigid body + // we likely have a large number of rigid objects with only a + // a few atoms each. so we loop over all atoms for all threads + // and then each thread only processes some bodies. + +#if defined(_OPENMP) +#pragma omp parallel default(none) private(i,ibody) +#endif + { +#if defined(_OPENMP) + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + + for (i = 0; i < nlocal; i++) { + ibody = atom2body[i]; + if ((ibody < 0) || (ibody % nthreads != tid)) continue; + + Body &b = body[ibody]; + + double unwrap[3]; + domain->unmap(x[i],image[i],unwrap); + + double * _noalias const fcm = b.fcm; + double * _noalias const xcm = b.xcm; + double * _noalias const tcm = b.torque; + + const double dx = unwrap[0] - xcm[0]; + const double dy = unwrap[1] - xcm[1]; + const double dz = unwrap[2] - xcm[2]; + + fcm[0] += f[i].x; + fcm[1] += f[i].y; + fcm[2] += f[i].z; + + tcm[0] += dy*f[i].z - dz*f[i].y; + tcm[1] += dz*f[i].x - dx*f[i].z; + tcm[2] += dx*f[i].y - dy*f[i].x; + + if (extended && (eflags[i] & TORQUE)) { + tcm[0] += torque_one[i][0]; + tcm[1] += torque_one[i][1]; + tcm[2] += torque_one[i][2]; + } + } + } // end of omp parallel region + + // reverse communicate fcm, torque of all bodies + + commflag = FORCE_TORQUE; + comm->reverse_comm_variable_fix(this); + + // include Langevin thermostat forces and torques + + if (langflag) { +#if defined(_OPENMP) +#pragma omp parallel for default(none) private(ibody) schedule(static) +#endif + for (ibody = 0; ibody < nlocal_body; ibody++) { + double * _noalias const fcm = body[ibody].fcm; + fcm[0] += langextra[ibody][0]; + fcm[1] += langextra[ibody][1]; + fcm[2] += langextra[ibody][2]; + double * _noalias const tcm = body[ibody].torque; + tcm[0] += langextra[ibody][3]; + tcm[1] += langextra[ibody][4]; + tcm[2] += langextra[ibody][5]; + } + } + + // update vcm and angmom, recompute omega + +#if defined(_OPENMP) +#pragma omp parallel for default(none) private(ibody) schedule(static) +#endif + for (ibody = 0; ibody < nlocal_body; ibody++) { + Body &b = body[ibody]; + + // update vcm by 1/2 step + + const double dtfm = dtf / b.mass; + b.vcm[0] += dtfm * b.fcm[0]; + b.vcm[1] += dtfm * b.fcm[1]; + b.vcm[2] += dtfm * b.fcm[2]; + + // update angular momentum by 1/2 step + + b.angmom[0] += dtf * b.torque[0]; + b.angmom[1] += dtf * b.torque[1]; + b.angmom[2] += dtf * b.torque[2]; + + MathExtra::angmom_to_omega(b.angmom,b.ex_space,b.ey_space, + b.ez_space,b.inertia,b.omega); + } + + // forward communicate updated info of all bodies + + commflag = FINAL; + comm->forward_comm_variable_fix(this); + + // set velocity/rotation of atoms in rigid bodies + // virial is already setup from initial_integrate + // triclinic only matters for virial calculation. 
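  // TRICLINIC and EVFLAG below are compile-time template parameters of
  // set_v_thr(); selecting the specialization here avoids per-atom branching
  // inside the threaded loops. A minimal sketch of the same pattern
  // (hypothetical kernel() stands in for set_v_thr()/set_xv_thr()):
  //
  //   template <int TRICLINIC, int EVFLAG> void kernel() {
  //     for (int i = 0; i < n; ++i) {
  //       if (TRICLINIC) { /* add box tilt terms */ }
  //       if (EVFLAG)    { /* tally virial */ }
  //     }
  //   }
  //   if (evflag) kernel<0,1>(); else kernel<0,0>();   // constants fold away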
+ + if (evflag) + if (triclinic) + set_v_thr<1,1>(); + else + set_v_thr<0,1>(); + else + set_v_thr<0,0>(); +} + + +/* ---------------------------------------------------------------------- + set space-frame coords and velocity of each atom in each rigid body + set orientation and rotation of extended particles + x = Q displace + Xcm, mapped back to periodic box + v = Vcm + (W cross (x - Xcm)) +------------------------------------------------------------------------- */ + +template +void FixRigidSmallOMP::set_xv_thr() +{ + dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + dbl3_t * _noalias const v = (dbl3_t *) atom->v[0]; + const dbl3_t * _noalias const f = (dbl3_t *) atom->f[0]; + const double * _noalias const rmass = atom->rmass; + const double * _noalias const mass = atom->mass; + const int * _noalias const type = atom->type; + const tagint * _noalias const image = atom->image; + + double v0=0.0,v1=0.0,v2=0.0,v3=0.0,v4=0.0,v5=0.0; + + const double xprd = domain->xprd; + const double yprd = domain->yprd; + const double zprd = domain->zprd; + const double xy = domain->xy; + const double xz = domain->xz; + const double yz = domain->yz; + + // set x and v of each atom + + const int nlocal = atom->nlocal; + int i; + +#if defined(_OPENMP) +#pragma omp parallel for default(none) private(i) reduction(+:v0,v1,v2,v3,v4,v5) +#endif + for (i = 0; i < nlocal; i++) { + const int ibody = atom2body[i]; + if (ibody < 0) continue; + + Body &b = body[ibody]; + + const int xbox = (image[i] & IMGMASK) - IMGMAX; + const int ybox = (image[i] >> IMGBITS & IMGMASK) - IMGMAX; + const int zbox = (image[i] >> IMG2BITS) - IMGMAX; + const double deltax = xbox*xprd + (TRICLINIC ? ybox*xy + zbox*xz : 0.0); + const double deltay = ybox*yprd + (TRICLINIC ? zbox*yz : 0.0); + const double deltaz = zbox*zprd; + + // save old positions and velocities for virial + double x0,x1,x2,vx,vy,vz; + if (EVFLAG) { + x0 = x[i].x + deltax; + x1 = x[i].y + deltay; + x2 = x[i].z + deltaz; + vx = v[i].x; + vy = v[i].y; + vz = v[i].z; + } + + // x = displacement from center-of-mass, based on body orientation + // v = vcm + omega around center-of-mass + + MathExtra::matvec(b.ex_space,b.ey_space,b.ez_space,displace[i],&x[i].x); + + v[i].x = b.omega[1]*x[i].z - b.omega[2]*x[i].y + b.vcm[0]; + v[i].y = b.omega[2]*x[i].x - b.omega[0]*x[i].z + b.vcm[1]; + v[i].z = b.omega[0]*x[i].y - b.omega[1]*x[i].x + b.vcm[2]; + + // add center of mass to displacement + // map back into periodic box via xbox,ybox,zbox + // for triclinic, add in box tilt factors as well + + x[i].x += b.xcm[0] - deltax; + x[i].y += b.xcm[1] - deltay; + x[i].z += b.xcm[2] - deltaz; + + // virial = unwrapped coords dotted into body constraint force + // body constraint force = implied force due to v change minus f external + // assume f does not include forces internal to body + // 1/2 factor b/c final_integrate contributes other half + // assume per-atom contribution is due to constraint force on that atom + + if (EVFLAG) { + double massone,vr[6]; + + if (rmass) massone = rmass[i]; + else massone = mass[type[i]]; + + const double fc0 = 0.5*(massone*(v[i].x - vx)/dtf - f[i].x); + const double fc1 = 0.5*(massone*(v[i].y - vy)/dtf - f[i].y); + const double fc2 = 0.5*(massone*(v[i].z - vz)/dtf - f[i].z); + + vr[0] = x0*fc0; vr[1] = x1*fc1; vr[2] = x2*fc2; + vr[3] = x0*fc1; vr[4] = x0*fc2; vr[5] = x1*fc2; + + // Fix::v_tally() is not thread safe, so we do this manually here + // accumulate global virial into thread-local variables for reduction + if (vflag_global) { + v0 += vr[0]; 
+ v1 += vr[1]; + v2 += vr[2]; + v3 += vr[3]; + v4 += vr[4]; + v5 += vr[5]; + } + + // accumulate per atom virial directly since we parallelize over atoms. + if (vflag_atom) { + vatom[i][0] += vr[0]; + vatom[i][1] += vr[1]; + vatom[i][2] += vr[2]; + vatom[i][3] += vr[3]; + vatom[i][4] += vr[4]; + vatom[i][5] += vr[5]; + } + } + } + + // second part of thread safe virial accumulation + // add global virial component after it was reduced across all threads + if (EVFLAG) { + if (vflag_global) { + virial[0] += v0; + virial[1] += v1; + virial[2] += v2; + virial[3] += v3; + virial[4] += v4; + virial[5] += v5; + } + } + + // set orientation, omega, angmom of each extended particle + // XXX: extended particle info not yet multi-threaded + + if (extended) { + double ione[3],exone[3],eyone[3],ezone[3],p[3][3]; + double theta_body,theta; + double *shape,*quatatom,*inertiaatom; + + AtomVecEllipsoid::Bonus *ebonus; + if (avec_ellipsoid) ebonus = avec_ellipsoid->bonus; + AtomVecLine::Bonus *lbonus; + if (avec_line) lbonus = avec_line->bonus; + AtomVecTri::Bonus *tbonus; + if (avec_tri) tbonus = avec_tri->bonus; + double **omega = atom->omega; + double **angmom = atom->angmom; + double **mu = atom->mu; + int *ellipsoid = atom->ellipsoid; + int *line = atom->line; + int *tri = atom->tri; + + for (int i = 0; i < nlocal; i++) { + if (atom2body[i] < 0) continue; + Body &b = body[atom2body[i]]; + + if (eflags[i] & SPHERE) { + omega[i][0] = b.omega[0]; + omega[i][1] = b.omega[1]; + omega[i][2] = b.omega[2]; + } else if (eflags[i] & ELLIPSOID) { + shape = ebonus[ellipsoid[i]].shape; + quatatom = ebonus[ellipsoid[i]].quat; + MathExtra::quatquat(b.quat,orient[i],quatatom); + MathExtra::qnormalize(quatatom); + ione[0] = EINERTIA*rmass[i] * (shape[1]*shape[1] + shape[2]*shape[2]); + ione[1] = EINERTIA*rmass[i] * (shape[0]*shape[0] + shape[2]*shape[2]); + ione[2] = EINERTIA*rmass[i] * (shape[0]*shape[0] + shape[1]*shape[1]); + MathExtra::q_to_exyz(quatatom,exone,eyone,ezone); + MathExtra::omega_to_angmom(b.omega,exone,eyone,ezone,ione,angmom[i]); + } else if (eflags[i] & LINE) { + if (b.quat[3] >= 0.0) theta_body = 2.0*acos(b.quat[0]); + else theta_body = -2.0*acos(b.quat[0]); + theta = orient[i][0] + theta_body; + while (theta <= MINUSPI) theta += TWOPI; + while (theta > MY_PI) theta -= TWOPI; + lbonus[line[i]].theta = theta; + omega[i][0] = b.omega[0]; + omega[i][1] = b.omega[1]; + omega[i][2] = b.omega[2]; + } else if (eflags[i] & TRIANGLE) { + inertiaatom = tbonus[tri[i]].inertia; + quatatom = tbonus[tri[i]].quat; + MathExtra::quatquat(b.quat,orient[i],quatatom); + MathExtra::qnormalize(quatatom); + MathExtra::q_to_exyz(quatatom,exone,eyone,ezone); + MathExtra::omega_to_angmom(b.omega,exone,eyone,ezone, + inertiaatom,angmom[i]); + } + if (eflags[i] & DIPOLE) { + MathExtra::quat_to_mat(b.quat,p); + MathExtra::matvec(p,dorient[i],mu[i]); + MathExtra::snormalize3(mu[i][3],mu[i],mu[i]); + } + } + } +} + +/* ---------------------------------------------------------------------- + set space-frame velocity of each atom in a rigid body + set omega and angmom of extended particles + v = Vcm + (W cross (x - Xcm)) +------------------------------------------------------------------------- */ + +template +void FixRigidSmallOMP::set_v_thr() +{ + dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + dbl3_t * _noalias const v = (dbl3_t *) atom->v[0]; + const dbl3_t * _noalias const f = (dbl3_t *) atom->f[0]; + const double * _noalias const rmass = atom->rmass; + const double * _noalias const mass = atom->mass; + const int * 
_noalias const type = atom->type; + const tagint * _noalias const image = atom->image; + + const double xprd = domain->xprd; + const double yprd = domain->yprd; + const double zprd = domain->zprd; + const double xy = domain->xy; + const double xz = domain->xz; + const double yz = domain->yz; + + double v0=0.0,v1=0.0,v2=0.0,v3=0.0,v4=0.0,v5=0.0; + + // set v of each atom + + const int nlocal = atom->nlocal; + int i; + +#if defined(_OPENMP) +#pragma omp parallel for default(none) private(i) reduction(+:v0,v1,v2,v3,v4,v5) +#endif + for (i = 0; i < nlocal; i++) { + const int ibody = atom2body[i]; + if (ibody < 0) continue; + + Body &b = body[atom2body[i]]; + double delta[3],vx,vy,vz; + + MathExtra::matvec(b.ex_space,b.ey_space,b.ez_space,displace[i],delta); + + // save old velocities for virial + + if (EVFLAG) { + vx = v[i].x; + vy = v[i].y; + vz = v[i].z; + } + + v[i].x = b.omega[1]*delta[2] - b.omega[2]*delta[1] + b.vcm[0]; + v[i].y = b.omega[2]*delta[0] - b.omega[0]*delta[2] + b.vcm[1]; + v[i].z = b.omega[0]*delta[1] - b.omega[1]*delta[0] + b.vcm[2]; + + // virial = unwrapped coords dotted into body constraint force + // body constraint force = implied force due to v change minus f external + // assume f does not include forces internal to body + // 1/2 factor b/c initial_integrate contributes other half + // assume per-atom contribution is due to constraint force on that atom + + if (EVFLAG) { + double massone, vr[6]; + if (rmass) massone = rmass[i]; + else massone = mass[type[i]]; + + const int xbox = (image[i] & IMGMASK) - IMGMAX; + const int ybox = (image[i] >> IMGBITS & IMGMASK) - IMGMAX; + const int zbox = (image[i] >> IMG2BITS) - IMGMAX; + const double deltax = xbox*xprd + (TRICLINIC ? ybox*xy + zbox*xz : 0.0); + const double deltay = ybox*yprd + (TRICLINIC ? zbox*yz : 0.0); + const double deltaz = zbox*zprd; + + const double fc0 = 0.5*(massone*(v[i].x - vx)/dtf - f[i].x); + const double fc1 = 0.5*(massone*(v[i].y - vy)/dtf - f[i].y); + const double fc2 = 0.5*(massone*(v[i].z - vz)/dtf - f[i].z); + + const double x0 = x[i].x + deltax; + const double x1 = x[i].y + deltay; + const double x2 = x[i].z + deltaz; + + vr[0] = x0*fc0; vr[1] = x1*fc1; vr[2] = x2*fc2; + vr[3] = x0*fc1; vr[4] = x0*fc2; vr[5] = x1*fc2; + + // Fix::v_tally() is not thread safe, so we do this manually here + // accumulate global virial into thread-local variables and reduce them later + if (vflag_global) { + v0 += vr[0]; + v1 += vr[1]; + v2 += vr[2]; + v3 += vr[3]; + v4 += vr[4]; + v5 += vr[5]; + } + + // accumulate per atom virial directly since we parallelize over atoms. 
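        // (the global virial, by contrast, goes into the thread-private
        //  v0..v5 scalars named in the reduction(+:...) clause and is
        //  folded into virial[0..5] once after the parallel loop, since
        //  Fix::v_tally() is not thread safe)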
+ if (vflag_atom) { + vatom[i][0] += vr[0]; + vatom[i][1] += vr[1]; + vatom[i][2] += vr[2]; + vatom[i][3] += vr[3]; + vatom[i][4] += vr[4]; + vatom[i][5] += vr[5]; + } + } + } // end of parallel for + + // second part of thread safe virial accumulation + // add global virial component after it was reduced across all threads + if (EVFLAG) { + if (vflag_global) { + virial[0] += v0; + virial[1] += v1; + virial[2] += v2; + virial[3] += v3; + virial[4] += v4; + virial[5] += v5; + } + } + + // set omega, angmom of each extended particle + // XXX: extended particle info not yet multi-threaded + + if (extended) { + double ione[3],exone[3],eyone[3],ezone[3]; + double *shape,*quatatom,*inertiaatom; + + AtomVecEllipsoid::Bonus *ebonus; + if (avec_ellipsoid) ebonus = avec_ellipsoid->bonus; + AtomVecTri::Bonus *tbonus; + if (avec_tri) tbonus = avec_tri->bonus; + double **omega = atom->omega; + double **angmom = atom->angmom; + int *ellipsoid = atom->ellipsoid; + int *tri = atom->tri; + + for (int i = 0; i < nlocal; i++) { + if (atom2body[i] < 0) continue; + Body &b = body[atom2body[i]]; + + if (eflags[i] & SPHERE) { + omega[i][0] = b.omega[0]; + omega[i][1] = b.omega[1]; + omega[i][2] = b.omega[2]; + } else if (eflags[i] & ELLIPSOID) { + shape = ebonus[ellipsoid[i]].shape; + quatatom = ebonus[ellipsoid[i]].quat; + ione[0] = EINERTIA*rmass[i] * (shape[1]*shape[1] + shape[2]*shape[2]); + ione[1] = EINERTIA*rmass[i] * (shape[0]*shape[0] + shape[2]*shape[2]); + ione[2] = EINERTIA*rmass[i] * (shape[0]*shape[0] + shape[1]*shape[1]); + MathExtra::q_to_exyz(quatatom,exone,eyone,ezone); + MathExtra::omega_to_angmom(b.omega,exone,eyone,ezone,ione, + angmom[i]); + } else if (eflags[i] & LINE) { + omega[i][0] = b.omega[0]; + omega[i][1] = b.omega[1]; + omega[i][2] = b.omega[2]; + } else if (eflags[i] & TRIANGLE) { + inertiaatom = tbonus[tri[i]].inertia; + quatatom = tbonus[tri[i]].quat; + MathExtra::q_to_exyz(quatatom,exone,eyone,ezone); + MathExtra::omega_to_angmom(b.omega,exone,eyone,ezone, + inertiaatom,angmom[i]); + } + } + } +} + diff --git a/src/USER-OMP/fix_rigid_small_omp.h b/src/USER-OMP/fix_rigid_small_omp.h new file mode 100644 index 0000000000..ffc4bee660 --- /dev/null +++ b/src/USER-OMP/fix_rigid_small_omp.h @@ -0,0 +1,134 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(rigid/small/omp,FixRigidSmallOMP) + +#else + +#ifndef LMP_FIX_RIGID_SMALL_OMP_H +#define LMP_FIX_RIGID_SMALL_OMP_H + +#include "fix_rigid_small.h" + +namespace LAMMPS_NS { + +class FixRigidSmallOMP : public FixRigidSmall { + public: + FixRigidSmallOMP(class LAMMPS *lmp, int narg, char **args) + : FixRigidSmall(lmp,narg,args) {}; + virtual ~FixRigidSmallOMP() {}; + + virtual void initial_integrate(int); + virtual void final_integrate(); + + private: + template void set_xv_thr(); + template void set_v_thr(); +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. 
Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Fix rigid molecule requires atom attribute molecule + +Self-explanatory. + +E: Could not find fix rigid group ID + +A group ID used in the fix rigid command does not exist. + +E: One or more atoms belong to multiple rigid bodies + +Two or more rigid bodies defined by the fix rigid command cannot +contain the same atom. + +E: No rigid bodies defined + +The fix specification did not end up defining any rigid bodies. + +E: Fix rigid z force cannot be on for 2d simulation + +Self-explanatory. + +E: Fix rigid xy torque cannot be on for 2d simulation + +Self-explanatory. + +E: Fix rigid langevin period must be > 0.0 + +Self-explanatory. + +E: One or zero atoms in rigid body + +Any rigid body defined by the fix rigid command must contain 2 or more +atoms. + +W: More than one fix rigid + +It is not efficient to use fix rigid more than once. + +E: Rigid fix must come before NPT/NPH fix + +NPT/NPH fix must be defined in input script after all rigid fixes, +else the rigid fix contribution to the pressure virial is +incorrect. + +W: Computing temperature of portions of rigid bodies + +The group defined by the temperature compute does not encompass all +the atoms in one or more rigid bodies, so the change in +degrees-of-freedom for the atoms in those partial rigid bodies will +not be accounted for. + +E: Fix rigid atom has non-zero image flag in a non-periodic dimension + +You cannot set image flags for non-periodic dimensions. + +E: Insufficient Jacobi rotations for rigid body + +Eigensolve for rigid body was not sufficiently accurate. + +E: Fix rigid: Bad principal moments + +The principal moments of inertia computed for a rigid body +are not within the required tolerances. + +E: Cannot open fix rigid infile %s + +UNDOCUMENTED + +E: Unexpected end of fix rigid file + +UNDOCUMENTED + +E: Incorrect rigid body format in fix rigid file + +UNDOCUMENTED + +E: Invalid rigid body ID in fix rigid file + +UNDOCUMENTED + +*/ diff --git a/src/USER-OMP/pppm_disp_omp.cpp b/src/USER-OMP/pppm_disp_omp.cpp new file mode 100644 index 0000000000..6d6ab70a0a --- /dev/null +++ b/src/USER-OMP/pppm_disp_omp.cpp @@ -0,0 +1,1779 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Axel Kohlmeyer (Temple U), Rolf Isele-Holder (RWTH Aachen University) +------------------------------------------------------------------------- */ + +#include "pppm_disp_omp.h" +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "force.h" +#include "memory.h" +#include "math_const.h" + +#include +#include + +#include "suffix.h" +using namespace LAMMPS_NS; +using namespace MathConst; + +#ifdef FFT_SINGLE +#define ZEROF 0.0f +#define ONEF 1.0f +#else +#define ZEROF 0.0 +#define ONEF 1.0 +#endif + +#define OFFSET 16384 + + +/* ---------------------------------------------------------------------- */ + +PPPMDispOMP::PPPMDispOMP(LAMMPS *lmp, int narg, char **arg) : + PPPMDisp(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE) +{ + suffix_flag |= Suffix::OMP; +} + +/* ---------------------------------------------------------------------- + allocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPMDispOMP::allocate() +{ + PPPMDisp::allocate(); + + const int nthreads = comm->nthreads; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + + if (function[0]) { + ThrData *thr = fix->get_thr(tid); + thr->init_pppm(order,memory); + } + if (function[1] + function[2]) { + ThrData * thr = fix->get_thr(tid); + thr->init_pppm_disp(order_6,memory); + } + } +} + +/* ---------------------------------------------------------------------- + free memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPMDispOMP::deallocate() +{ + PPPMDisp::deallocate(); + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + if (function[0]) { + ThrData * thr = fix->get_thr(tid); + thr->init_pppm(-order,memory); + } + if (function[1] + function[2]) { + ThrData * thr = fix->get_thr(tid); + thr->init_pppm_disp(-order_6,memory); + } + } +} + +/* ---------------------------------------------------------------------- + Compute the modified (hockney-eastwood) coulomb green function +------------------------------------------------------------------------- */ + +void PPPMDispOMP::compute_gf() +{ +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + double unitkx = (2.0*MY_PI/xprd); + double unitky = (2.0*MY_PI/yprd); + double unitkz = (2.0*MY_PI/zprd_slab); + + int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m; + int kper,lper,mper; + double snx,sny,snz,snx2,sny2,snz2; + double sqk; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,dot1,dot2; + double numerator,denominator; + + const int nnx = nxhi_fft-nxlo_fft+1; + const int nny = nyhi_fft-nylo_fft+1; + + loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads); + + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + qz = unitkz*mper; + snz = sin(0.5*qz*zprd_slab/nz_pppm); + snz2 = snz*snz; + sz = exp(-0.25*pow(qz/g_ewald,2.0)); + wz = 1.0; + argz = 
0.5*qz*zprd_slab/nz_pppm; + if (argz != 0.0) wz = pow(sin(argz)/argz,order); + wz *= wz; + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + qy = unitky*lper; + sny = sin(0.5*qy*yprd/ny_pppm); + sny2 = sny*sny; + sy = exp(-0.25*pow(qy/g_ewald,2.0)); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm; + if (argy != 0.0) wy = pow(sin(argy)/argy,order); + wy *= wy; + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + + /* only compute the part designated to this thread */ + nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft)); + if ((nn < nnfrom) || (nn >=nnto)) continue; + + kper = k - nx_pppm*(2*k/nx_pppm); + qx = unitkx*kper; + snx = sin(0.5*qx*xprd/nx_pppm); + snx2 = snx*snx; + sx = exp(-0.25*pow(qx/g_ewald,2.0)); + wx = 1.0; + argx = 0.5*qx*xprd/nx_pppm; + if (argx != 0.0) wx = pow(sin(argx)/argx,order); + wx *= wx; + + sqk = pow(qx,2.0) + pow(qy,2.0) + pow(qz,2.0); + + if (sqk != 0.0) { + numerator = 4.0*MY_PI/sqk; + denominator = gf_denom(snx2,sny2,snz2, gf_b, order); + greensfn[nn] = numerator*sx*sy*sz*wx*wy*wz/denominator; + } else greensfn[nn] = 0.0; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + Compyute the modified (hockney-eastwood) dispersion green function +------------------------------------------------------------------------- */ + +void PPPMDispOMP::compute_gf_6() +{ +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + double *prd; + int k,l,m,nn; + + // volume-dependent factors + // adjust z dimension for 2d slab PPPM + // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + double unitkx = (2.0*MY_PI/xprd); + double unitky = (2.0*MY_PI/yprd); + double unitkz = (2.0*MY_PI/zprd_slab); + + int kper,lper,mper; + double sqk; + double snx,sny,snz,snx2,sny2,snz2; + double argx,argy,argz,wx,wy,wz,sx,sy,sz; + double qx,qy,qz; + double rtsqk, term; + double numerator,denominator; + double inv2ew = 2*g_ewald_6; + inv2ew = 1/inv2ew; + double rtpi = sqrt(MY_PI); + int nnfrom, nnto, tid; + + numerator = -MY_PI*rtpi*g_ewald_6*g_ewald_6*g_ewald_6/(3.0); + + const int nnx = nxhi_fft_6-nxlo_fft_6+1; + const int nny = nyhi_fft_6-nylo_fft_6+1; + + loop_setup_thr(nnfrom, nnto, tid, nfft_6, comm->nthreads); + + for (m = nzlo_fft_6; m <= nzhi_fft_6; m++) { + mper = m - nz_pppm_6*(2*m/nz_pppm_6); + qz = unitkz*mper; + snz = sin(0.5*unitkz*mper*zprd_slab/nz_pppm_6); + snz2 = snz*snz; + sz = exp(-qz*qz*inv2ew*inv2ew); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm_6; + if (argz != 0.0) wz = pow(sin(argz)/argz,order_6); + wz *= wz; + + for (l = nylo_fft_6; l <= nyhi_fft_6; l++) { + lper = l - ny_pppm_6*(2*l/ny_pppm_6); + qy = unitky*lper; + sny = sin(0.5*unitky*lper*yprd/ny_pppm_6); + sny2 = sny*sny; + sy = exp(-qy*qy*inv2ew*inv2ew); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm_6; + if (argy != 0.0) wy = pow(sin(argy)/argy,order_6); + wy *= wy; + + for (k = nxlo_fft_6; k <= nxhi_fft_6; k++) { + + /* only compute the part designated to this thread */ + nn = k-nxlo_fft_6 + nnx*(l-nylo_fft_6 + nny*(m-nzlo_fft_6)); + if ((nn < nnfrom) || (nn >=nnto)) continue; + + kper = k - nx_pppm_6*(2*k/nx_pppm_6); + qx = unitkx*kper; + snx = sin(0.5*unitkx*kper*xprd/nx_pppm_6); + snx2 = snx*snx; + sx = exp(-qx*qx*inv2ew*inv2ew); + wx = 1.0; + argx = 0.5*qx*xprd/nx_pppm_6; + if (argx != 0.0) wx = pow(sin(argx)/argx,order_6); + wx *= wx; + 
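          // writing b = 1/(2*g_ewald_6) and k^2 = sqk, the lines below
          // assemble the dispersion (1/r^6) analog of the optimized
          // Green's function:
          //   greensfn_6(k) = -pi^(3/2)*g_ewald_6^3/3
          //                   * [ (1 - 2*k^2*b^2)*exp(-k^2*b^2)
          //                       + 2*k^3*b^3*sqrt(pi)*erfc(k*b) ]
          //                   * wx*wy*wz / gf_denom(snx2,sny2,snz2,gf_b_6,order_6)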
+ sqk = pow(qx,2.0) + pow(qy,2.0) + pow(qz,2.0); + + denominator = gf_denom(snx2,sny2,snz2, gf_b_6, order_6); + rtsqk = sqrt(sqk); + term = (1-2*sqk*inv2ew*inv2ew)*sx*sy*sz + + 2*sqk*rtsqk*inv2ew*inv2ew*inv2ew*rtpi*erfc(rtsqk*inv2ew); + greensfn_6[nn] = numerator*term*wx*wy*wz/denominator; + } + } + } + } +} +/* ---------------------------------------------------------------------- + run the regular toplevel compute method from plain PPPPM + which will have individual methods replaced by our threaded + versions and then call the obligatory force reduction. +------------------------------------------------------------------------- */ + +void PPPMDispOMP::compute(int eflag, int vflag) +{ + + PPPMDisp::compute(eflag,vflag); +#if defined(_OPENMP) +#pragma omp parallel default(none) shared(eflag,vflag) +#endif + { +#if defined(_OPENMP) + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + + ThrData *thr = fix->get_thr(tid); + reduce_thr(this, eflag, vflag, thr); + } // end of omp parallel region +} + + +/* ---------------------------------------------------------------------- + find center grid pt for each of my particles + check that full stencil for the particle will fit in my 3d brick + store central grid pt indices in part2grid array +------------------------------------------------------------------------- */ + +void PPPMDispOMP::particle_map(double dxinv, double dyinv, + double dzinv, double sft, + int ** part2grid, int nup, + int nlw, int nxlo_o, + int nylo_o, int nzlo_o, + int nxhi_o, int nyhi_o, + int nzhi_o) +{ + const int * _noalias const type = atom->type; + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + int3_t * _noalias const p2g = (int3_t *) part2grid[0]; + const double boxlox = boxlo[0]; + const double boxloy = boxlo[1]; + const double boxloz = boxlo[2]; + const int nlocal = atom->nlocal; + + const double delxinv = dxinv; + const double delyinv = dyinv; + const double delzinv = dzinv; + const double shift = sft; + const int nupper = nup; + const int nlower = nlw; + const int nxlo_out = nxlo_o; + const int nylo_out = nylo_o; + const int nzlo_out = nzlo_o; + const int nxhi_out = nxhi_o; + const int nyhi_out = nyhi_o; + const int nzhi_out = nzhi_o; + + int i, flag = 0; +#if defined(_OPENMP) +#pragma omp parallel for private(i) default(none) reduction(+:flag) schedule(static) +#endif + for (i = 0; i < nlocal; i++) { + + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + const int nx = static_cast ((x[i].x-boxlox)*delxinv+shift) - OFFSET; + const int ny = static_cast ((x[i].y-boxloy)*delyinv+shift) - OFFSET; + const int nz = static_cast ((x[i].z-boxloz)*delzinv+shift) - OFFSET; + + p2g[i].a = nx; + p2g[i].b = ny; + p2g[i].t = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + flag++; + } + + int flag_all; + MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world); + if (flag_all) error->all(FLERR,"Out of range atoms - cannot compute PPPM"); +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick 
(including ghosts) + in global grid +------------------------------------------------------------------------- */ + +void PPPMDispOMP::make_rho_c() +{ + + // clear 3d density array + + FFT_SCALAR * _noalias const d = &(density_brick[nzlo_out][nylo_out][nxlo_out]); + memset(d,0,ngrid*sizeof(FFT_SCALAR)); + + const int ix = nxhi_out - nxlo_out + 1; + const int iy = nyhi_out - nylo_out + 1; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + const double * _noalias const q = atom->q; + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + const int3_t * _noalias const p2g = (int3_t *) part2grid[0]; + + const double boxlox = boxlo[0]; + const double boxloy = boxlo[1]; + const double boxloz = boxlo[2]; + + // determine range of grid points handled by this thread + int i,jfrom,jto,tid; + loop_setup_thr(jfrom,jto,tid,ngrid,comm->nthreads); + + // get per thread data + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + + // loop over all local atoms for all threads + const int nlocal = atom->nlocal; + for (i = 0; i < nlocal; i++) { + + const int nx = p2g[i].a; + const int ny = p2g[i].b; + const int nz = p2g[i].t; + + // pre-screen whether this atom will ever come within + // reach of the data segement this thread is updating. + if ( ((nz+nlower-nzlo_out)*ix*iy >= jto) + || ((nz+nupper-nzlo_out+1)*ix*iy < jfrom) ) continue; + + const FFT_SCALAR dx = nx+shiftone - (x[i].x-boxlox)*delxinv; + const FFT_SCALAR dy = ny+shiftone - (x[i].y-boxloy)*delyinv; + const FFT_SCALAR dz = nz+shiftone - (x[i].z-boxloz)*delzinv; + + compute_rho1d_thr(r1d,dx,dy,dz,order,rho_coeff); + + const FFT_SCALAR z0 = delvolinv * q[i]; + + for (int n = nlower; n <= nupper; ++n) { + const int jn = (nz+n-nzlo_out)*ix*iy; + const FFT_SCALAR y0 = z0*r1d[2][n]; + + for (int m = nlower; m <= nupper; ++m) { + const int jm = jn+(ny+m-nylo_out)*ix; + const FFT_SCALAR x0 = y0*r1d[1][m]; + + for (int l = nlower; l <= nupper; ++l) { + const int jl = jm+nx+l-nxlo_out; + // make sure each thread only updates + // "his" elements of the density grid + if (jl >= jto) break; + if (jl < jfrom) continue; + + d[jl] += x0*r1d[0][l]; + } + } + } + } + } +} + + +/* ---------------------------------------------------------------------- + same as above for dispersion interaction with geometric mixing rule +------------------------------------------------------------------------- */ + +void PPPMDispOMP::make_rho_g() +{ + + // clear 3d density array + + FFT_SCALAR * _noalias const d = &(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]); + memset(d,0,ngrid_6*sizeof(FFT_SCALAR)); + + const int ix = nxhi_out_6 - nxlo_out_6 + 1; + const int iy = nyhi_out_6 - nylo_out_6 + 1; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + const int3_t * _noalias const p2g = (int3_t *) part2grid_6[0]; + + const double boxlox = boxlo[0]; + const double boxloy = boxlo[1]; + const double boxloz = boxlo[2]; + + // determine range of grid points handled by this thread + int i,jfrom,jto,tid; + loop_setup_thr(jfrom,jto,tid,ngrid_6,comm->nthreads); + + // get per thread data + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + + // loop over my charges, add their contribution to 
nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + + // loop over all local atoms for all threads + const int nlocal = atom->nlocal; + for (i = 0; i < nlocal; i++) { + + const int nx = p2g[i].a; + const int ny = p2g[i].b; + const int nz = p2g[i].t; + + // pre-screen whether this atom will ever come within + // reach of the data segement this thread is updating. + if ( ((nz+nlower_6-nzlo_out_6)*ix*iy >= jto) + || ((nz+nupper_6-nzlo_out_6+1)*ix*iy < jfrom) ) continue; + + const FFT_SCALAR dx = nx+shiftone_6 - (x[i].x-boxlox)*delxinv_6; + const FFT_SCALAR dy = ny+shiftone_6 - (x[i].y-boxloy)*delyinv_6; + const FFT_SCALAR dz = nz+shiftone_6 - (x[i].z-boxloz)*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz,order_6,rho_coeff_6); + + const int type = atom->type[i]; + const double lj = B[type]; + const FFT_SCALAR z0 = delvolinv_6 * lj; + + for (int n = nlower_6; n <= nupper_6; ++n) { + const int jn = (nz+n-nzlo_out_6)*ix*iy; + const FFT_SCALAR y0 = z0*r1d[2][n]; + + for (int m = nlower_6; m <= nupper_6; ++m) { + const int jm = jn+(ny+m-nylo_out_6)*ix; + const FFT_SCALAR x0 = y0*r1d[1][m]; + + for (int l = nlower_6; l <= nupper_6; ++l) { + const int jl = jm+nx+l-nxlo_out_6; + // make sure each thread only updates + // "his" elements of the density grid + if (jl >= jto) break; + if (jl < jfrom) continue; + + d[jl] += x0*r1d[0][l]; + } + } + } + } + } +} + + +/* ---------------------------------------------------------------------- + same as above for dispersion interaction with arithmetic mixing rule +------------------------------------------------------------------------- */ + +void PPPMDispOMP::make_rho_a() +{ + + // clear 3d density array + + FFT_SCALAR * _noalias const d0 = &(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d1 = &(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d2 = &(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d3 = &(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d4 = &(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d5 = &(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d6 = &(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]); + + memset(d0,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d1,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d2,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d3,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d4,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d5,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d6,0,ngrid_6*sizeof(FFT_SCALAR)); + + const int ix = nxhi_out_6 - nxlo_out_6 + 1; + const int iy = nyhi_out_6 - nylo_out_6 + 1; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + const int3_t * _noalias const p2g = (int3_t *) part2grid_6[0]; + + const double boxlox = boxlo[0]; + const double boxloy = boxlo[1]; + const double boxloz = boxlo[2]; + + // determine range of grid points handled by this thread + int i,jfrom,jto,tid; + loop_setup_thr(jfrom,jto,tid,ngrid_6,comm->nthreads); + + // get per thread data + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid 
pt + + // loop over all local atoms for all threads + const int nlocal = atom->nlocal; + for (i = 0; i < nlocal; i++) { + + const int nx = p2g[i].a; + const int ny = p2g[i].b; + const int nz = p2g[i].t; + + // pre-screen whether this atom will ever come within + // reach of the data segement this thread is updating. + if ( ((nz+nlower_6-nzlo_out_6)*ix*iy >= jto) + || ((nz+nupper_6-nzlo_out_6+1)*ix*iy < jfrom) ) continue; + + const FFT_SCALAR dx = nx+shiftone_6 - (x[i].x-boxlox)*delxinv_6; + const FFT_SCALAR dy = ny+shiftone_6 - (x[i].y-boxloy)*delyinv_6; + const FFT_SCALAR dz = nz+shiftone_6 - (x[i].z-boxloz)*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz,order_6,rho_coeff_6); + + const int type = atom->type[i]; + const double lj0 = B[7*type]; + const double lj1 = B[7*type+1]; + const double lj2 = B[7*type+2]; + const double lj3 = B[7*type+3]; + const double lj4 = B[7*type+4]; + const double lj5 = B[7*type+5]; + const double lj6 = B[7*type+6]; + + const FFT_SCALAR z0 = delvolinv_6; + + for (int n = nlower_6; n <= nupper_6; ++n) { + const int jn = (nz+n-nzlo_out_6)*ix*iy; + const FFT_SCALAR y0 = z0*r1d[2][n]; + + for (int m = nlower_6; m <= nupper_6; ++m) { + const int jm = jn+(ny+m-nylo_out_6)*ix; + const FFT_SCALAR x0 = y0*r1d[1][m]; + + for (int l = nlower_6; l <= nupper_6; ++l) { + const int jl = jm+nx+l-nxlo_out_6; + // make sure each thread only updates + // "his" elements of the density grid + if (jl >= jto) break; + if (jl < jfrom) continue; + + const double w = x0*r1d[0][l]; + + d0[jl] += w*lj0; + d1[jl] += w*lj1; + d2[jl] += w*lj2; + d3[jl] += w*lj3; + d4[jl] += w*lj4; + d5[jl] += w*lj5; + d6[jl] += w*lj6; + } + } + } + } + } +} + + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles + for ik scheme +------------------------------------------------------------------------- */ + +void PPPMDispOMP::fieldforce_c_ik() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + const double * const q = atom->q; + const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + const double qqrd2e = force->qqrd2e; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? 
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + double * const * const f = thr->get_f(); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); + + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx,eky,ekz; + + // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d_thr(r1d,dx,dy,dz, order, rho_coeff); + + ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + ekx -= x0*vdx_brick[mz][my][mx]; + eky -= x0*vdy_brick[mz][my][mx]; + ekz -= x0*vdz_brick[mz][my][mx]; + } + } + } + + // convert E-field to force + const double qfactor = qqrd2e*scale*q[i]; + f[i][0] += qfactor*ekx; + f[i][1] += qfactor*eky; + f[i][2] += qfactor*ekz; + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles + for ad scheme +------------------------------------------------------------------------- */ + +void PPPMDispOMP::fieldforce_c_ad() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + const double * const q = atom->q; + const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + const double qqrd2e = force->qqrd2e; + //const double * const sf_c = sf_coeff; + + double *prd; + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + const double hx_inv = nx_pppm/xprd; + const double hy_inv = ny_pppm/yprd; + const double hz_inv = nz_pppm/zprd_slab; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? 
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + double * const * const f = thr->get_f(); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); + FFT_SCALAR * const * const dr1d = static_cast(thr->get_drho1d()); + + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx,eky,ekz; + double sf = 0.0; + double s1,s2,s3; + + // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d_thr(r1d,dx,dy,dz, order, rho_coeff); + compute_drho1d_thr(dr1d,dx,dy,dz, order, drho_coeff); + + ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + ekx += dr1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx]; + eky += r1d[0][l]*dr1d[1][m]*r1d[2][n]*u_brick[mz][my][mx]; + ekz += r1d[0][l]*r1d[1][m]*dr1d[2][n]*u_brick[mz][my][mx]; + } + } + } + ekx *= hx_inv; + eky *= hy_inv; + ekz *= hz_inv; + + // convert E-field to force + const double qfactor = qqrd2e*scale; + + s1 = x[i][0]*hx_inv; + s2 = x[i][1]*hy_inv; + s3 = x[i][2]*hz_inv; + sf = sf_coeff[0]*sin(2*MY_PI*s1); + sf += sf_coeff[1]*sin(4*MY_PI*s1); + sf *= 2*q[i]*q[i]; + f[i][0] += qfactor*(ekx*q[i] - sf); + + sf = sf_coeff[2]*sin(2*MY_PI*s2); + sf += sf_coeff[3]*sin(4*MY_PI*s2); + sf *= 2*q[i]*q[i]; + f[i][1] += qfactor*(eky*q[i] - sf); + + + sf = sf_coeff[4]*sin(2*MY_PI*s3); + sf += sf_coeff[5]*sin(4*MY_PI*s3); + sf *= 2*q[i]*q[i]; + if (slabflag != 2) f[i][2] += qfactor*(ekz*q[i] - sf); + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get per-atom energy/virial + ------------------------------------------------------------------------- */ + +void PPPMDispOMP::fieldforce_c_peratom() +{ + // loop over my charges, interpolate from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + const double * const q = atom->q; + const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? 
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); + + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR u,v0,v1,v2,v3,v4,v5; + + // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d_thr(r1d,dx,dy,dz, order, rho_coeff); + + u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + if (eflag_atom) u += x0*u_brick[mz][my][mx]; + if (vflag_atom) { + v0 += x0*v0_brick[mz][my][mx]; + v1 += x0*v1_brick[mz][my][mx]; + v2 += x0*v2_brick[mz][my][mx]; + v3 += x0*v3_brick[mz][my][mx]; + v4 += x0*v4_brick[mz][my][mx]; + v5 += x0*v5_brick[mz][my][mx]; + } + } + } + } + + const double qfactor = 0.5*force->qqrd2e * scale * q[i]; + + if (eflag_atom) eatom[i] += u*qfactor; + if (vflag_atom) { + vatom[i][0] += v0*qfactor; + vatom[i][1] += v1*qfactor; + vatom[i][2] += v2*qfactor; + vatom[i][3] += v3*qfactor; + vatom[i][4] += v4*qfactor; + vatom[i][5] += v5*qfactor; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for ik scheme and geometric mixing rule +------------------------------------------------------------------------- */ + +void PPPMDispOMP::fieldforce_g_ik() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + const double qqrd2e = force->qqrd2e; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? 
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + double * const * const f = thr->get_f(); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx,eky,ekz; + int type; + double lj; + + // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { + + nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); + + ekx = eky = ekz = ZEROF; + for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + ekx -= x0*vdx_brick_g[mz][my][mx]; + eky -= x0*vdy_brick_g[mz][my][mx]; + ekz -= x0*vdz_brick_g[mz][my][mx]; + } + } + } + + // convert E-field to force + type = atom->type[i]; + lj = B[type]; + f[i][0] += lj*ekx; + f[i][1] += lj*eky; + f[i][2] += lj*ekz; + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for ad scheme and geometric mixing rule +------------------------------------------------------------------------- */ + +void PPPMDispOMP::fieldforce_g_ad() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + + double *prd; + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + const double hx_inv = nx_pppm_6/xprd; + const double hy_inv = ny_pppm_6/yprd; + const double hz_inv = nz_pppm_6/zprd_slab; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? 
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + double * const * const f = thr->get_f(); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + FFT_SCALAR * const * const dr1d = static_cast(thr->get_drho1d_6()); + + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx,eky,ekz; + int type; + double lj; + double sf = 0.0; + double s1,s2,s3; + + // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { + + nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); + compute_drho1d_thr(dr1d,dx,dy,dz, order_6, drho_coeff_6); + + ekx = eky = ekz = ZEROF; + for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + ekx += dr1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick_g[mz][my][mx]; + eky += r1d[0][l]*dr1d[1][m]*r1d[2][n]*u_brick_g[mz][my][mx]; + ekz += r1d[0][l]*r1d[1][m]*dr1d[2][n]*u_brick_g[mz][my][mx]; + } + } + } + ekx *= hx_inv; + eky *= hy_inv; + ekz *= hz_inv; + + // convert E-field to force + type = atom->type[i]; + lj = B[type]; + + s1 = x[i][0]*hx_inv; + s2 = x[i][1]*hy_inv; + s3 = x[i][2]*hz_inv; + + sf = sf_coeff_6[0]*sin(2*MY_PI*s1); + sf += sf_coeff_6[1]*sin(4*MY_PI*s1); + sf *= 2*lj*lj; + f[i][0] += ekx*lj - sf; + + sf = sf_coeff_6[2]*sin(2*MY_PI*s2); + sf += sf_coeff_6[3]*sin(4*MY_PI*s2); + sf *= 2*lj*lj; + f[i][1] += eky*lj - sf; + + sf = sf_coeff_6[4]*sin(2*MY_PI*s3); + sf += sf_coeff_6[5]*sin(4*MY_PI*s3); + sf *= 2*lj*lj; + if (slabflag != 2) f[i][2] += ekz*lj - sf; + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get per-atom energy/virial for dispersion + interaction and geometric mixing rule + ------------------------------------------------------------------------- */ + +void PPPMDispOMP::fieldforce_g_peratom() +{ + // loop over my charges, interpolate from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? 
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR u,v0,v1,v2,v3,v4,v5; + int type; + double lj; + + // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { + + nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); + + u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; + for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + if (eflag_atom) u += x0*u_brick_g[mz][my][mx]; + if (vflag_atom) { + v0 += x0*v0_brick_g[mz][my][mx]; + v1 += x0*v1_brick_g[mz][my][mx]; + v2 += x0*v2_brick_g[mz][my][mx]; + v3 += x0*v3_brick_g[mz][my][mx]; + v4 += x0*v4_brick_g[mz][my][mx]; + v5 += x0*v5_brick_g[mz][my][mx]; + } + } + } + } + + type = atom->type[i]; + lj = B[type]*0.5; + + if (eflag_atom) eatom[i] += u*lj; + if (vflag_atom) { + vatom[i][0] += v0*lj; + vatom[i][1] += v1*lj; + vatom[i][2] += v2*lj; + vatom[i][3] += v3*lj; + vatom[i][4] += v4*lj; + vatom[i][5] += v5*lj; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for ik scheme and arithmetic mixing rule +------------------------------------------------------------------------- */ + +void PPPMDispOMP::fieldforce_a_ik() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + const double qqrd2e = force->qqrd2e; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? 
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + double * const * const f = thr->get_f(); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2; + FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5; + FFT_SCALAR ekx6, eky6, ekz6; + int type; + double lj0,lj1,lj2,lj3,lj4,lj5,lj6; + + // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { + + nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); + + ekx0 = eky0 = ekz0 = ZEROF; + ekx1 = eky1 = ekz1 = ZEROF; + ekx2 = eky2 = ekz2 = ZEROF; + ekx3 = eky3 = ekz3 = ZEROF; + ekx4 = eky4 = ekz4 = ZEROF; + ekx5 = eky5 = ekz5 = ZEROF; + ekx6 = eky6 = ekz6 = ZEROF; + for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + ekx0 -= x0*vdx_brick_a0[mz][my][mx]; + eky0 -= x0*vdy_brick_a0[mz][my][mx]; + ekz0 -= x0*vdz_brick_a0[mz][my][mx]; + ekx1 -= x0*vdx_brick_a1[mz][my][mx]; + eky1 -= x0*vdy_brick_a1[mz][my][mx]; + ekz1 -= x0*vdz_brick_a1[mz][my][mx]; + ekx2 -= x0*vdx_brick_a2[mz][my][mx]; + eky2 -= x0*vdy_brick_a2[mz][my][mx]; + ekz2 -= x0*vdz_brick_a2[mz][my][mx]; + ekx3 -= x0*vdx_brick_a3[mz][my][mx]; + eky3 -= x0*vdy_brick_a3[mz][my][mx]; + ekz3 -= x0*vdz_brick_a3[mz][my][mx]; + ekx4 -= x0*vdx_brick_a4[mz][my][mx]; + eky4 -= x0*vdy_brick_a4[mz][my][mx]; + ekz4 -= x0*vdz_brick_a4[mz][my][mx]; + ekx5 -= x0*vdx_brick_a5[mz][my][mx]; + eky5 -= x0*vdy_brick_a5[mz][my][mx]; + ekz5 -= x0*vdz_brick_a5[mz][my][mx]; + ekx6 -= x0*vdx_brick_a6[mz][my][mx]; + eky6 -= x0*vdy_brick_a6[mz][my][mx]; + ekz6 -= x0*vdz_brick_a6[mz][my][mx]; + } + } + } + + // convert D-field to force + type = atom->type[i]; + lj0 = B[7*type+6]; + lj1 = B[7*type+5]; + lj2 = B[7*type+4]; + lj3 = B[7*type+3]; + lj4 = B[7*type+2]; + lj5 = B[7*type+1]; + lj6 = B[7*type]; + f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 + lj4*ekx4 + lj5*ekx5 + lj6*ekx6; + f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 + lj4*eky4 + lj5*eky5 + lj6*eky6; + f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 + lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6; + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for ad scheme and arithmetic mixing rule +------------------------------------------------------------------------- */ + +void PPPMDispOMP::fieldforce_a_ad() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + + double *prd; + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd 
= prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + const double hx_inv = nx_pppm_6/xprd; + const double hy_inv = ny_pppm_6/yprd; + const double hz_inv = nz_pppm_6/zprd_slab; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + double * const * const f = thr->get_f(); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + FFT_SCALAR * const * const dr1d = static_cast(thr->get_drho1d_6()); + + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2; + FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5; + FFT_SCALAR ekx6, eky6, ekz6; + int type; + double lj0,lj1,lj2,lj3,lj4,lj5,lj6; + double sf = 0.0; + double s1,s2,s3; + + // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { + + nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); + compute_drho1d_thr(dr1d,dx,dy,dz, order_6, drho_coeff_6); + + ekx0 = eky0 = ekz0 = ZEROF; + ekx1 = eky1 = ekz1 = ZEROF; + ekx2 = eky2 = ekz2 = ZEROF; + ekx3 = eky3 = ekz3 = ZEROF; + ekx4 = eky4 = ekz4 = ZEROF; + ekx5 = eky5 = ekz5 = ZEROF; + ekx6 = eky6 = ekz6 = ZEROF; + for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + x0 = dr1d[0][l]*r1d[1][m]*r1d[2][n]; + y0 = r1d[0][l]*dr1d[1][m]*r1d[2][n]; + z0 = r1d[0][l]*r1d[1][m]*dr1d[2][n]; + + ekx0 += x0*u_brick_a0[mz][my][mx]; + eky0 += y0*u_brick_a0[mz][my][mx]; + ekz0 += z0*u_brick_a0[mz][my][mx]; + + ekx1 += x0*u_brick_a1[mz][my][mx]; + eky1 += y0*u_brick_a1[mz][my][mx]; + ekz1 += z0*u_brick_a1[mz][my][mx]; + + ekx2 += x0*u_brick_a2[mz][my][mx]; + eky2 += y0*u_brick_a2[mz][my][mx]; + ekz2 += z0*u_brick_a2[mz][my][mx]; + + ekx3 += x0*u_brick_a3[mz][my][mx]; + eky3 += y0*u_brick_a3[mz][my][mx]; + ekz3 += z0*u_brick_a3[mz][my][mx]; + + ekx4 += x0*u_brick_a4[mz][my][mx]; + eky4 += y0*u_brick_a4[mz][my][mx]; + ekz4 += z0*u_brick_a4[mz][my][mx]; + + ekx5 += x0*u_brick_a5[mz][my][mx]; + eky5 += y0*u_brick_a5[mz][my][mx]; + ekz5 += z0*u_brick_a5[mz][my][mx]; + + ekx6 += x0*u_brick_a6[mz][my][mx]; + eky6 += y0*u_brick_a6[mz][my][mx]; + ekz6 += z0*u_brick_a6[mz][my][mx]; + } + } + } + + ekx0 *= hx_inv; + eky0 *= hy_inv; + ekz0 *= hz_inv; + + ekx1 *= hx_inv; + eky1 *= hy_inv; + ekz1 *= hz_inv; + + ekx2 *= hx_inv; + eky2 *= hy_inv; + ekz2 *= hz_inv; + + ekx3 *= hx_inv; + eky3 *= hy_inv; + ekz3 *= hz_inv; + + ekx4 *= hx_inv; + eky4 *= hy_inv; + ekz4 *= hz_inv; + + ekx5 *= hx_inv; + eky5 *= hy_inv; + ekz5 *= hz_inv; + + ekx6 *= hx_inv; + eky6 *= hy_inv; + ekz6 *= hz_inv; + + // convert D-field to force + type = atom->type[i]; + lj0 = B[7*type+6]; + lj1 = B[7*type+5]; + lj2 = B[7*type+4]; + lj3 = B[7*type+3]; + lj4 = B[7*type+2]; + lj5 = 
B[7*type+1]; + lj6 = B[7*type]; + + s1 = x[i][0]*hx_inv; + s2 = x[i][1]*hy_inv; + s3 = x[i][2]*hz_inv; + + sf = sf_coeff_6[0]*sin(2*MY_PI*s1); + sf += sf_coeff_6[1]*sin(4*MY_PI*s1); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 + lj4*ekx4 + lj5*ekx5 + lj6*ekx6 - sf; + + sf = sf_coeff_6[2]*sin(2*MY_PI*s2); + sf += sf_coeff_6[3]*sin(4*MY_PI*s2); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 + lj4*eky4 + lj5*eky5 + lj6*eky6 - sf; + + sf = sf_coeff_6[4]*sin(2*MY_PI*s3); + sf += sf_coeff_6[5]*sin(4*MY_PI*s3); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 + lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6 - sf; + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get per-atom energy/virial for dispersion + interaction and arithmetic mixing rule + ------------------------------------------------------------------------- */ + +void PPPMDispOMP::fieldforce_a_peratom() +{ + // loop over my charges, interpolate from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? 
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR u0,v00,v10,v20,v30,v40,v50; + FFT_SCALAR u1,v01,v11,v21,v31,v41,v51; + FFT_SCALAR u2,v02,v12,v22,v32,v42,v52; + FFT_SCALAR u3,v03,v13,v23,v33,v43,v53; + FFT_SCALAR u4,v04,v14,v24,v34,v44,v54; + FFT_SCALAR u5,v05,v15,v25,v35,v45,v55; + FFT_SCALAR u6,v06,v16,v26,v36,v46,v56; + int type; + double lj0,lj1,lj2,lj3,lj4,lj5,lj6; + + // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { + + nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); + + u0 = v00 = v10 = v20 = v30 = v40 = v50 = ZEROF; + u1 = v01 = v11 = v21 = v31 = v41 = v51 = ZEROF; + u2 = v02 = v12 = v22 = v32 = v42 = v52 = ZEROF; + u3 = v03 = v13 = v23 = v33 = v43 = v53 = ZEROF; + u4 = v04 = v14 = v24 = v34 = v44 = v54 = ZEROF; + u5 = v05 = v15 = v25 = v35 = v45 = v55 = ZEROF; + u6 = v06 = v16 = v26 = v36 = v46 = v56 = ZEROF; + for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + if (eflag_atom) { + u0 += x0*u_brick_a0[mz][my][mx]; + u1 += x0*u_brick_a1[mz][my][mx]; + u2 += x0*u_brick_a2[mz][my][mx]; + u3 += x0*u_brick_a3[mz][my][mx]; + u4 += x0*u_brick_a4[mz][my][mx]; + u5 += x0*u_brick_a5[mz][my][mx]; + u6 += x0*u_brick_a6[mz][my][mx]; + } + if (vflag_atom) { + v00 += x0*v0_brick_a0[mz][my][mx]; + v10 += x0*v1_brick_a0[mz][my][mx]; + v20 += x0*v2_brick_a0[mz][my][mx]; + v30 += x0*v3_brick_a0[mz][my][mx]; + v40 += x0*v4_brick_a0[mz][my][mx]; + v50 += x0*v5_brick_a0[mz][my][mx]; + v01 += x0*v0_brick_a1[mz][my][mx]; + v11 += x0*v1_brick_a1[mz][my][mx]; + v21 += x0*v2_brick_a1[mz][my][mx]; + v31 += x0*v3_brick_a1[mz][my][mx]; + v41 += x0*v4_brick_a1[mz][my][mx]; + v51 += x0*v5_brick_a1[mz][my][mx]; + v02 += x0*v0_brick_a2[mz][my][mx]; + v12 += x0*v1_brick_a2[mz][my][mx]; + v22 += x0*v2_brick_a2[mz][my][mx]; + v32 += x0*v3_brick_a2[mz][my][mx]; + v42 += x0*v4_brick_a2[mz][my][mx]; + v52 += x0*v5_brick_a2[mz][my][mx]; + v03 += x0*v0_brick_a3[mz][my][mx]; + v13 += x0*v1_brick_a3[mz][my][mx]; + v23 += x0*v2_brick_a3[mz][my][mx]; + v33 += x0*v3_brick_a3[mz][my][mx]; + v43 += x0*v4_brick_a3[mz][my][mx]; + v53 += x0*v5_brick_a3[mz][my][mx]; + v04 += x0*v0_brick_a4[mz][my][mx]; + v14 += x0*v1_brick_a4[mz][my][mx]; + v24 += x0*v2_brick_a4[mz][my][mx]; + v34 += x0*v3_brick_a4[mz][my][mx]; + v44 += x0*v4_brick_a4[mz][my][mx]; + v54 += x0*v5_brick_a4[mz][my][mx]; + v05 += x0*v0_brick_a5[mz][my][mx]; + v15 += x0*v1_brick_a5[mz][my][mx]; + v25 += x0*v2_brick_a5[mz][my][mx]; + v35 += x0*v3_brick_a5[mz][my][mx]; + v45 += x0*v4_brick_a5[mz][my][mx]; + v55 += x0*v5_brick_a5[mz][my][mx]; + v06 += x0*v0_brick_a6[mz][my][mx]; + v16 += x0*v1_brick_a6[mz][my][mx]; + v26 += x0*v2_brick_a6[mz][my][mx]; + v36 += x0*v3_brick_a6[mz][my][mx]; + v46 += x0*v4_brick_a6[mz][my][mx]; + v56 += x0*v5_brick_a6[mz][my][mx]; + } + } + } + } + + // convert D-field to force + type = atom->type[i]; + lj0 = 
B[7*type+6]*0.5; + lj1 = B[7*type+5]*0.5; + lj2 = B[7*type+4]*0.5; + lj3 = B[7*type+3]*0.5; + lj4 = B[7*type+2]*0.5; + lj5 = B[7*type+1]*0.5; + lj6 = B[7*type]*0.5; + + if (eflag_atom) + eatom[i] += u0*lj0 + u1*lj1 + u2*lj2 + + u3*lj3 + u4*lj4 + u5*lj5 + u6*lj6; + if (vflag_atom) { + vatom[i][0] += v00*lj0 + v01*lj1 + v02*lj2 + v03*lj3 + + v04*lj4 + v05*lj5 + v06*lj6; + vatom[i][1] += v10*lj0 + v11*lj1 + v12*lj2 + v13*lj3 + + v14*lj4 + v15*lj5 + v16*lj6; + vatom[i][2] += v20*lj0 + v21*lj1 + v22*lj2 + v23*lj3 + + v24*lj4 + v25*lj5 + v26*lj6; + vatom[i][3] += v30*lj0 + v31*lj1 + v32*lj2 + v33*lj3 + + v34*lj4 + v35*lj5 + v36*lj6; + vatom[i][4] += v40*lj0 + v41*lj1 + v42*lj2 + v43*lj3 + + v44*lj4 + v45*lj5 + v46*lj6; + vatom[i][5] += v50*lj0 + v51*lj1 + v52*lj2 + v53*lj3 + + v54*lj4 + v55*lj5 + v56*lj6; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + charge assignment into rho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ +void PPPMDispOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx, + const FFT_SCALAR &dy, const FFT_SCALAR &dz, + const int ord, FFT_SCALAR * const * const rho_c) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-ord)/2; k <= ord/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = ord-1; l >= 0; l--) { + r1 = rho_c[l][k] + r1*dx; + r2 = rho_c[l][k] + r2*dy; + r3 = rho_c[l][k] + r3*dz; + } + r1d[0][k] = r1; + r1d[1][k] = r2; + r1d[2][k] = r3; + } +} + +/* ---------------------------------------------------------------------- + charge assignment into drho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ + +void PPPMDispOMP::compute_drho1d_thr(FFT_SCALAR * const * const dr1d, const FFT_SCALAR &dx, + const FFT_SCALAR &dy, const FFT_SCALAR &dz, + const int ord, FFT_SCALAR * const * const drho_c) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-ord)/2; k <= ord/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = ord-2; l >= 0; l--) { + r1 = drho_c[l][k] + r1*dx; + r2 = drho_c[l][k] + r2*dy; + r3 = drho_c[l][k] + r3*dz; + } + dr1d[0][k] = r1; + dr1d[1][k] = r2; + dr1d[2][k] = r3; + } +} diff --git a/src/USER-OMP/pppm_disp_omp.h b/src/USER-OMP/pppm_disp_omp.h new file mode 100644 index 0000000000..e2f588d169 --- /dev/null +++ b/src/USER-OMP/pppm_disp_omp.h @@ -0,0 +1,74 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(pppm/disp/omp,PPPMDispOMP) + +#else + +#ifndef LMP_PPPM_DISP_OMP_H +#define LMP_PPPM_DISP_OMP_H + +#include "pppm_disp.h" +#include "thr_omp.h" + +namespace LAMMPS_NS { + + class PPPMDispOMP : public PPPMDisp, public ThrOMP { + public: + PPPMDispOMP(class LAMMPS *, int, char **); + virtual ~PPPMDispOMP () {}; + virtual void compute(int, int); + + protected: + virtual void allocate(); + virtual void deallocate(); + + virtual void compute_gf(); + virtual void compute_gf_6(); + + virtual void particle_map(double,double,double, + double,int**,int,int, + int,int,int,int,int,int); + + + virtual void fieldforce_c_ik(); + virtual void fieldforce_c_ad(); + virtual void fieldforce_c_peratom(); + virtual void fieldforce_g_ik(); + virtual void fieldforce_g_ad(); + virtual void fieldforce_g_peratom(); + virtual void fieldforce_a_ik(); + virtual void fieldforce_a_ad(); + virtual void fieldforce_a_peratom(); + + virtual void make_rho_c(); + virtual void make_rho_g(); + virtual void make_rho_a(); + + void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &, + const FFT_SCALAR &, const FFT_SCALAR &, + const int, FFT_SCALAR * const * const); + void compute_drho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &, + const FFT_SCALAR &, const FFT_SCALAR &, + const int, FFT_SCALAR * const * const); +// void compute_rho_coeff(); +// void slabcorr(int); + +}; + +} + +#endif +#endif diff --git a/src/USER-OMP/pppm_disp_tip4p_omp.cpp b/src/USER-OMP/pppm_disp_tip4p_omp.cpp new file mode 100644 index 0000000000..bbb96228a3 --- /dev/null +++ b/src/USER-OMP/pppm_disp_tip4p_omp.cpp @@ -0,0 +1,1826 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Axel Kohlmeyer (Temple U) +------------------------------------------------------------------------- */ + +#include "pppm_disp_tip4p_omp.h" +#include "atom.h" +#include "comm.h" +#include "domain.h" +#include "error.h" +#include "fix_omp.h" +#include "force.h" +#include "memory.h" +#include "math_const.h" +#include "math_special.h" + +#include +#include + +#include "suffix.h" +using namespace LAMMPS_NS; +using namespace MathConst; +using namespace MathSpecial; + +#ifdef FFT_SINGLE +#define ZEROF 0.0f +#else +#define ZEROF 0.0 +#endif + +#define OFFSET 16384 + +/* ---------------------------------------------------------------------- */ + +PPPMDispTIP4POMP::PPPMDispTIP4POMP(LAMMPS *lmp, int narg, char **arg) : + PPPMDispTIP4P(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE) +{ + tip4pflag = 1; + suffix_flag |= Suffix::OMP; +} + +/* ---------------------------------------------------------------------- + allocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::allocate() +{ + PPPMDispTIP4P::allocate(); + + const int nthreads = comm->nthreads; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + + if (function[0]) { + ThrData *thr = fix->get_thr(tid); + thr->init_pppm(order,memory); + } + if (function[1] + function[2]) { + ThrData * thr = fix->get_thr(tid); + thr->init_pppm_disp(order_6,memory); + } + } +} + +/* ---------------------------------------------------------------------- + free memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::deallocate() +{ + PPPMDispTIP4P::deallocate(); + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + if (function[0]) { + ThrData * thr = fix->get_thr(tid); + thr->init_pppm(-order,memory); + } + if (function[1] + function[2]) { + ThrData * thr = fix->get_thr(tid); + thr->init_pppm_disp(-order_6,memory); + } + } +} + + +/* ---------------------------------------------------------------------- + Compute the modified (hockney-eastwood) coulomb green function +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::compute_gf() +{ +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + + double *prd; + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + double unitkx = (2.0*MY_PI/xprd); + double unitky = (2.0*MY_PI/yprd); + double unitkz = (2.0*MY_PI/zprd_slab); + + int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m; + int kper,lper,mper; + double snx,sny,snz,snx2,sny2,snz2; + double sqk; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,dot1,dot2; + double numerator,denominator; + + const int nnx = nxhi_fft-nxlo_fft+1; + const int nny = nyhi_fft-nylo_fft+1; + + loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads); + + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + qz = unitkz*mper; + snz = sin(0.5*qz*zprd_slab/nz_pppm); + 
snz2 = snz*snz; + sz = exp(-0.25*pow(qz/g_ewald,2.0)); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm; + if (argz != 0.0) wz = pow(sin(argz)/argz,order); + wz *= wz; + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + qy = unitky*lper; + sny = sin(0.5*qy*yprd/ny_pppm); + sny2 = sny*sny; + sy = exp(-0.25*pow(qy/g_ewald,2.0)); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm; + if (argy != 0.0) wy = pow(sin(argy)/argy,order); + wy *= wy; + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + + /* only compute the part designated to this thread */ + nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft)); + if ((nn < nnfrom) || (nn >=nnto)) continue; + + kper = k - nx_pppm*(2*k/nx_pppm); + qx = unitkx*kper; + snx = sin(0.5*qx*xprd/nx_pppm); + snx2 = snx*snx; + sx = exp(-0.25*pow(qx/g_ewald,2.0)); + wx = 1.0; + argx = 0.5*qx*xprd/nx_pppm; + if (argx != 0.0) wx = pow(sin(argx)/argx,order); + wx *= wx; + + sqk = pow(qx,2.0) + pow(qy,2.0) + pow(qz,2.0); + + if (sqk != 0.0) { + numerator = 4.0*MY_PI/sqk; + denominator = gf_denom(snx2,sny2,snz2, gf_b, order); + greensfn[nn] = numerator*sx*sy*sz*wx*wy*wz/denominator; + } else greensfn[nn] = 0.0; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + Compyute the modified (hockney-eastwood) dispersion green function +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::compute_gf_6() +{ +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + double *prd; + int k,l,m,nn; + + // volume-dependent factors + // adjust z dimension for 2d slab PPPM + // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + double unitkx = (2.0*MY_PI/xprd); + double unitky = (2.0*MY_PI/yprd); + double unitkz = (2.0*MY_PI/zprd_slab); + + int kper,lper,mper; + double sqk; + double snx,sny,snz,snx2,sny2,snz2; + double argx,argy,argz,wx,wy,wz,sx,sy,sz; + double qx,qy,qz; + double rtsqk, term; + double numerator,denominator; + double inv2ew = 2*g_ewald_6; + inv2ew = 1/inv2ew; + double rtpi = sqrt(MY_PI); + int nnfrom, nnto, tid; + + numerator = -MY_PI*rtpi*g_ewald_6*g_ewald_6*g_ewald_6/(3.0); + + const int nnx = nxhi_fft_6-nxlo_fft_6+1; + const int nny = nyhi_fft_6-nylo_fft_6+1; + + loop_setup_thr(nnfrom, nnto, tid, nfft_6, comm->nthreads); + + for (m = nzlo_fft_6; m <= nzhi_fft_6; m++) { + mper = m - nz_pppm_6*(2*m/nz_pppm_6); + qz = unitkz*mper; + snz = sin(0.5*unitkz*mper*zprd_slab/nz_pppm_6); + snz2 = snz*snz; + sz = exp(-qz*qz*inv2ew*inv2ew); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm_6; + if (argz != 0.0) wz = pow(sin(argz)/argz,order_6); + wz *= wz; + + for (l = nylo_fft_6; l <= nyhi_fft_6; l++) { + lper = l - ny_pppm_6*(2*l/ny_pppm_6); + qy = unitky*lper; + sny = sin(0.5*unitky*lper*yprd/ny_pppm_6); + sny2 = sny*sny; + sy = exp(-qy*qy*inv2ew*inv2ew); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm_6; + if (argy != 0.0) wy = pow(sin(argy)/argy,order_6); + wy *= wy; + + for (k = nxlo_fft_6; k <= nxhi_fft_6; k++) { + + /* only compute the part designated to this thread */ + nn = k-nxlo_fft_6 + nnx*(l-nylo_fft_6 + nny*(m-nzlo_fft_6)); + if ((nn < nnfrom) || (nn >=nnto)) continue; + + kper = k - nx_pppm_6*(2*k/nx_pppm_6); + qx = unitkx*kper; + snx = sin(0.5*unitkx*kper*xprd/nx_pppm_6); + snx2 = snx*snx; + sx = exp(-qx*qx*inv2ew*inv2ew); + wx = 1.0; + argx = 
0.5*qx*xprd/nx_pppm_6; + if (argx != 0.0) wx = pow(sin(argx)/argx,order_6); + wx *= wx; + + sqk = pow(qx,2.0) + pow(qy,2.0) + pow(qz,2.0); + + denominator = gf_denom(snx2,sny2,snz2, gf_b_6, order_6); + rtsqk = sqrt(sqk); + term = (1-2*sqk*inv2ew*inv2ew)*sx*sy*sz + + 2*sqk*rtsqk*inv2ew*inv2ew*inv2ew*rtpi*erfc(rtsqk*inv2ew); + greensfn_6[nn] = numerator*term*wx*wy*wz/denominator; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + run the regular toplevel compute method from plain PPPM + which will have individual methods replaced by our threaded + versions and then call the obligatory force reduction. +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::compute(int eflag, int vflag) +{ + + PPPMDispTIP4P::compute(eflag,vflag); + +#if defined(_OPENMP) +#pragma omp parallel default(none) shared(eflag,vflag) +#endif + { +#if defined(_OPENMP) + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + reduce_thr(this, eflag, vflag, thr); + } // end of omp parallel region +} + +/* ---------------------------------------------------------------------- + find center grid pt for each of my particles + check that full stencil for the particle will fit in my 3d brick + store central grid pt indices in part2grid array +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::particle_map_c(double dxinv, double dyinv, + double dzinv, double sft, + int ** part2grid, int nup, + int nlw, int nxlo_o, + int nylo_o, int nzlo_o, + int nxhi_o, int nyhi_o, + int nzhi_o) +{ + const int * _noalias const type = atom->type; + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + int3_t * _noalias const p2g = (int3_t *) part2grid[0]; + const double boxlox = boxlo[0]; + const double boxloy = boxlo[1]; + const double boxloz = boxlo[2]; + const int nlocal = atom->nlocal; + + const double delxinv = dxinv; + const double delyinv = dyinv; + const double delzinv = dzinv; + const double shift = sft; + const int nupper = nup; + const int nlower = nlw; + const int nxlo_out = nxlo_o; + const int nylo_out = nylo_o; + const int nzlo_out = nzlo_o; + const int nxhi_out = nxhi_o; + const int nyhi_out = nyhi_o; + const int nzhi_out = nzhi_o; + + int i, flag = 0; +#if defined(_OPENMP) +#pragma omp parallel for private(i) default(none) reduction(+:flag) schedule(static) +#endif + for (i = 0; i < nlocal; i++) { + dbl3_t xM; + int iH1,iH2; + + if (type[i] == typeO) { + find_M_thr(i,iH1,iH2,xM); + } else { + xM = x[i]; + } + + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + const int nx = static_cast ((xM.x-boxlox)*delxinv+shift) - OFFSET; + const int ny = static_cast ((xM.y-boxloy)*delyinv+shift) - OFFSET; + const int nz = static_cast ((xM.z-boxloz)*delzinv+shift) - OFFSET; + + p2g[i].a = nx; + p2g[i].b = ny; + p2g[i].t = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + flag++; + } + + int flag_all; + MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world); + if (flag_all) error->all(FLERR,"Out of range atoms - cannot compute PPPM"); +} + +/* 
---------------------------------------------------------------------- + find center grid pt for each of my particles + check that full stencil for the particle will fit in my 3d brick + store central grid pt indices in part2grid array +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::particle_map(double dxinv, double dyinv, + double dzinv, double sft, + int ** part2grid, int nup, + int nlw, int nxlo_o, + int nylo_o, int nzlo_o, + int nxhi_o, int nyhi_o, + int nzhi_o) +{ + const int * _noalias const type = atom->type; + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + int3_t * _noalias const p2g = (int3_t *) part2grid[0]; + const double boxlox = boxlo[0]; + const double boxloy = boxlo[1]; + const double boxloz = boxlo[2]; + const int nlocal = atom->nlocal; + + const double delxinv = dxinv; + const double delyinv = dyinv; + const double delzinv = dzinv; + const double shift = sft; + const int nupper = nup; + const int nlower = nlw; + const int nxlo_out = nxlo_o; + const int nylo_out = nylo_o; + const int nzlo_out = nzlo_o; + const int nxhi_out = nxhi_o; + const int nyhi_out = nyhi_o; + const int nzhi_out = nzhi_o; + + int i, flag = 0; +#if defined(_OPENMP) +#pragma omp parallel for private(i) default(none) reduction(+:flag) schedule(static) +#endif + for (i = 0; i < nlocal; i++) { + + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + const int nx = static_cast ((x[i].x-boxlox)*delxinv+shift) - OFFSET; + const int ny = static_cast ((x[i].y-boxloy)*delyinv+shift) - OFFSET; + const int nz = static_cast ((x[i].z-boxloz)*delzinv+shift) - OFFSET; + + p2g[i].a = nx; + p2g[i].b = ny; + p2g[i].t = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + flag++; + } + + int flag_all; + MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world); + if (flag_all) error->all(FLERR,"Out of range atoms - cannot compute PPPM"); +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::make_rho_c() +{ + + // clear 3d density array + + FFT_SCALAR * _noalias const d = &(density_brick[nzlo_out][nylo_out][nxlo_out]); + memset(d,0,ngrid*sizeof(FFT_SCALAR)); + + const int ix = nxhi_out - nxlo_out + 1; + const int iy = nyhi_out - nylo_out + 1; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + const double * _noalias const q = atom->q; + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + const int3_t * _noalias const p2g = (int3_t *) part2grid[0]; + const int * _noalias const type = atom->type; + dbl3_t xM; + + const double boxlox = boxlo[0]; + const double boxloy = boxlo[1]; + const double boxloz = boxlo[2]; + + // determine range of grid points handled by this thread + int i,jfrom,jto,tid,iH1,iH2; + loop_setup_thr(jfrom,jto,tid,ngrid,comm->nthreads); + + // get per thread data + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d 
= static_cast(thr->get_rho1d()); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + + // loop over all local atoms for all threads + const int nlocal = atom->nlocal; + for (i = 0; i < nlocal; i++) { + + const int nx = p2g[i].a; + const int ny = p2g[i].b; + const int nz = p2g[i].t; + + // pre-screen whether this atom will ever come within + // reach of the data segement this thread is updating. + if ( ((nz+nlower-nzlo_out)*ix*iy >= jto) + || ((nz+nupper-nzlo_out+1)*ix*iy < jfrom) ) continue; + + if (type[i] == typeO) { + find_M_thr(i,iH1,iH2,xM); + } else { + xM = x[i]; + } + const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv; + const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv; + const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv; + + compute_rho1d_thr(r1d,dx,dy,dz,order,rho_coeff); + + const FFT_SCALAR z0 = delvolinv * q[i]; + + for (int n = nlower; n <= nupper; ++n) { + const int jn = (nz+n-nzlo_out)*ix*iy; + const FFT_SCALAR y0 = z0*r1d[2][n]; + + for (int m = nlower; m <= nupper; ++m) { + const int jm = jn+(ny+m-nylo_out)*ix; + const FFT_SCALAR x0 = y0*r1d[1][m]; + + for (int l = nlower; l <= nupper; ++l) { + const int jl = jm+nx+l-nxlo_out; + // make sure each thread only updates + // "his" elements of the density grid + if (jl >= jto) break; + if (jl < jfrom) continue; + + d[jl] += x0*r1d[0][l]; + } + } + } + } + } +} + + +/* ---------------------------------------------------------------------- + same as above for dispersion interaction with geometric mixing rule +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::make_rho_g() +{ + + // clear 3d density array + + FFT_SCALAR * _noalias const d = &(density_brick_g[nzlo_out_6][nylo_out_6][nxlo_out_6]); + memset(d,0,ngrid_6*sizeof(FFT_SCALAR)); + + const int ix = nxhi_out_6 - nxlo_out_6 + 1; + const int iy = nyhi_out_6 - nylo_out_6 + 1; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + const int3_t * _noalias const p2g = (int3_t *) part2grid_6[0]; + + const double boxlox = boxlo[0]; + const double boxloy = boxlo[1]; + const double boxloz = boxlo[2]; + + // determine range of grid points handled by this thread + int i,jfrom,jto,tid; + loop_setup_thr(jfrom,jto,tid,ngrid_6,comm->nthreads); + + // get per thread data + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + + // loop over all local atoms for all threads + const int nlocal = atom->nlocal; + for (i = 0; i < nlocal; i++) { + + const int nx = p2g[i].a; + const int ny = p2g[i].b; + const int nz = p2g[i].t; + + // pre-screen whether this atom will ever come within + // reach of the data segement this thread is updating. 
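+      // jfrom and jto (from loop_setup_thr) bound this thread's contiguous
+      // chunk of the flattened density brick (x fastest, then y, then z);
+      // the check below skips the atom if the z-planes its stencil touches
+      // lie entirely outside the [jfrom,jto) range.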
+ if ( ((nz+nlower_6-nzlo_out_6)*ix*iy >= jto) + || ((nz+nupper_6-nzlo_out_6+1)*ix*iy < jfrom) ) continue; + + const FFT_SCALAR dx = nx+shiftone_6 - (x[i].x-boxlox)*delxinv_6; + const FFT_SCALAR dy = ny+shiftone_6 - (x[i].y-boxloy)*delyinv_6; + const FFT_SCALAR dz = nz+shiftone_6 - (x[i].z-boxloz)*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz,order_6,rho_coeff_6); + + const int type = atom->type[i]; + const double lj = B[type]; + const FFT_SCALAR z0 = delvolinv_6 * lj; + + for (int n = nlower_6; n <= nupper_6; ++n) { + const int jn = (nz+n-nzlo_out_6)*ix*iy; + const FFT_SCALAR y0 = z0*r1d[2][n]; + + for (int m = nlower_6; m <= nupper_6; ++m) { + const int jm = jn+(ny+m-nylo_out_6)*ix; + const FFT_SCALAR x0 = y0*r1d[1][m]; + + for (int l = nlower_6; l <= nupper_6; ++l) { + const int jl = jm+nx+l-nxlo_out_6; + // make sure each thread only updates + // "his" elements of the density grid + if (jl >= jto) break; + if (jl < jfrom) continue; + + d[jl] += x0*r1d[0][l]; + } + } + } + } + } +} + + +/* ---------------------------------------------------------------------- + same as above for dispersion interaction with arithmetic mixing rule +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::make_rho_a() +{ + + // clear 3d density array + + FFT_SCALAR * _noalias const d0 = &(density_brick_a0[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d1 = &(density_brick_a1[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d2 = &(density_brick_a2[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d3 = &(density_brick_a3[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d4 = &(density_brick_a4[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d5 = &(density_brick_a5[nzlo_out_6][nylo_out_6][nxlo_out_6]); + FFT_SCALAR * _noalias const d6 = &(density_brick_a6[nzlo_out_6][nylo_out_6][nxlo_out_6]); + + memset(d0,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d1,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d2,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d3,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d4,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d5,0,ngrid_6*sizeof(FFT_SCALAR)); + memset(d6,0,ngrid_6*sizeof(FFT_SCALAR)); + + const int ix = nxhi_out_6 - nxlo_out_6 + 1; + const int iy = nyhi_out_6 - nylo_out_6 + 1; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + const int3_t * _noalias const p2g = (int3_t *) part2grid_6[0]; + + const double boxlox = boxlo[0]; + const double boxloy = boxlo[1]; + const double boxloz = boxlo[2]; + + // determine range of grid points handled by this thread + int i,jfrom,jto,tid; + loop_setup_thr(jfrom,jto,tid,ngrid_6,comm->nthreads); + + // get per thread data + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + + // loop over all local atoms for all threads + const int nlocal = atom->nlocal; + for (i = 0; i < nlocal; i++) { + + const int nx = p2g[i].a; + const int ny = p2g[i].b; + const int nz = p2g[i].t; + + // pre-screen whether this atom will ever come within + // reach of the data segement this thread is updating. 
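+      // same per-thread ownership test as in make_rho_g(): all seven
+      // arithmetic-mixing density bricks share one grid layout, so a single
+      // range check on the flattened index covers d0 through d6.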
+ if ( ((nz+nlower_6-nzlo_out_6)*ix*iy >= jto) + || ((nz+nupper_6-nzlo_out_6+1)*ix*iy < jfrom) ) continue; + + const FFT_SCALAR dx = nx+shiftone_6 - (x[i].x-boxlox)*delxinv_6; + const FFT_SCALAR dy = ny+shiftone_6 - (x[i].y-boxloy)*delyinv_6; + const FFT_SCALAR dz = nz+shiftone_6 - (x[i].z-boxloz)*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz,order_6,rho_coeff_6); + + const int type = atom->type[i]; + const double lj0 = B[7*type]; + const double lj1 = B[7*type+1]; + const double lj2 = B[7*type+2]; + const double lj3 = B[7*type+3]; + const double lj4 = B[7*type+4]; + const double lj5 = B[7*type+5]; + const double lj6 = B[7*type+6]; + + const FFT_SCALAR z0 = delvolinv_6; + + for (int n = nlower_6; n <= nupper_6; ++n) { + const int jn = (nz+n-nzlo_out_6)*ix*iy; + const FFT_SCALAR y0 = z0*r1d[2][n]; + + for (int m = nlower_6; m <= nupper_6; ++m) { + const int jm = jn+(ny+m-nylo_out_6)*ix; + const FFT_SCALAR x0 = y0*r1d[1][m]; + + for (int l = nlower_6; l <= nupper_6; ++l) { + const int jl = jm+nx+l-nxlo_out_6; + // make sure each thread only updates + // "his" elements of the density grid + if (jl >= jto) break; + if (jl < jfrom) continue; + + const double w = x0*r1d[0][l]; + + d0[jl] += w*lj0; + d1[jl] += w*lj1; + d2[jl] += w*lj2; + d3[jl] += w*lj3; + d4[jl] += w*lj4; + d5[jl] += w*lj5; + d6[jl] += w*lj6; + } + } + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles for ik +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::fieldforce_c_ik() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + const double * _noalias const q = atom->q; + const int3_t * _noalias const p2g = (int3_t *) part2grid[0]; + const int * _noalias const type = atom->type; + + const double qqrd2e = force->qqrd2e; + const double boxlox = boxlo[0]; + const double boxloy = boxlo[1]; + const double boxloz = boxlo[2]; + + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + dbl3_t xM; + FFT_SCALAR x0,y0,z0,ekx,eky,ekz; + int i,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz; + + loop_setup_thr(ifrom,ito,tid,nlocal,nthreads); + + // get per thread data + ThrData *thr = fix->get_thr(tid); + dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0]; + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); + + for (i = ifrom; i < ito; ++i) { + if (type[i] == typeO) { + find_M_thr(i,iH1,iH2,xM); + } else xM = x[i]; + + const int nx = p2g[i].a; + const int ny = p2g[i].b; + const int nz = p2g[i].t; + const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv; + const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv; + const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv; + + compute_rho1d_thr(r1d,dx,dy,dz, order, rho_coeff); + + ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + ekx -= x0*vdx_brick[mz][my][mx]; + eky -= x0*vdy_brick[mz][my][mx]; + ekz -= x0*vdz_brick[mz][my][mx]; + } + } + 
} +
+ // convert E-field to force +
+ const double qfactor = qqrd2e * scale * q[i]; + if (type[i] != typeO) { + f[i].x += qfactor*ekx; + f[i].y += qfactor*eky; + if (slabflag != 2) f[i].z += qfactor*ekz; +
+ } else { + const double fx = qfactor * ekx; + const double fy = qfactor * eky; + const double fz = qfactor * ekz; +
+ f[i].x += fx*(1 - alpha); + f[i].y += fy*(1 - alpha); + if (slabflag != 2) f[i].z += fz*(1 - alpha); +
+ f[iH1].x += 0.5*alpha*fx; + f[iH1].y += 0.5*alpha*fy; + if (slabflag != 2) f[iH1].z += 0.5*alpha*fz; +
+ f[iH2].x += 0.5*alpha*fx; + f[iH2].y += 0.5*alpha*fy; + if (slabflag != 2) f[iH2].z += 0.5*alpha*fz; + } + } + } // end of parallel region +} +
+/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles for ad +------------------------------------------------------------------------- */ +
+void PPPMDispTIP4POMP::fieldforce_c_ad() +{ + const double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda; + const double hx_inv = nx_pppm/prd[0]; + const double hy_inv = ny_pppm/prd[1]; + const double hz_inv = nz_pppm/prd[2]; +
+ // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle +
+ const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + const double * _noalias const q = atom->q; + const int3_t * _noalias const p2g = (int3_t *) part2grid[0]; + const int * _noalias const type = atom->type; +
+ const double qqrd2e = force->qqrd2e; + const double boxlox = boxlo[0]; + const double boxloy = boxlo[1]; + const double boxloz = boxlo[2]; +
+ const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; +
+#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { + double s1,s2,s3,sf; + dbl3_t xM; + FFT_SCALAR ekx,eky,ekz; + int i,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz; +
+ loop_setup_thr(ifrom,ito,tid,nlocal,nthreads); +
+ // get per thread data + ThrData *thr = fix->get_thr(tid); + dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0]; + FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d()); + FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d()); +
+ for (i = ifrom; i < ito; ++i) { + if (type[i] == typeO) { + find_M_thr(i,iH1,iH2,xM); + } else xM = x[i]; +
+ const int nx = p2g[i].a; + const int ny = p2g[i].b; + const int nz = p2g[i].t; + const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv; + const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv; + const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv; +
+ compute_rho1d_thr(r1d,dx,dy,dz,order,rho_coeff); + compute_drho1d_thr(d1d,dx,dy,dz,order,drho_coeff); +
+ ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx]; + eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx]; + ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx]; + } + } + } + ekx *= hx_inv; + eky *= hy_inv; + ekz *= hz_inv; +
+ // convert E-field to force and subtract self forces +
+ const double qi = q[i]; + const double qfactor = qqrd2e * scale * qi; +
+ s1 = x[i].x*hx_inv; + sf = sf_coeff[0]*sin(MY_2PI*s1); + sf += sf_coeff[1]*sin(MY_4PI*s1); + sf *= 2.0*qi; + const double fx =
qfactor*(ekx - sf); + + s2 = x[i].y*hy_inv; + sf = sf_coeff[2]*sin(MY_2PI*s2); + sf += sf_coeff[3]*sin(MY_4PI*s2); + sf *= 2.0*qi; + const double fy = qfactor*(eky - sf); + + s3 = x[i].z*hz_inv; + sf = sf_coeff[4]*sin(MY_2PI*s3); + sf += sf_coeff[5]*sin(MY_4PI*s3); + sf *= 2.0*qi; + const double fz = qfactor*(ekz - sf); + + if (type[i] != typeO) { + f[i].x += fx; + f[i].y += fy; + if (slabflag != 2) f[i].z += fz; + + } else { + f[i].x += fx*(1 - alpha); + f[i].y += fy*(1 - alpha); + if (slabflag != 2) f[i].z += fz*(1 - alpha); + + f[iH1].x += 0.5*alpha*fx; + f[iH1].y += 0.5*alpha*fy; + if (slabflag != 2) f[iH1].z += 0.5*alpha*fz; + + f[iH2].x += 0.5*alpha*fx; + f[iH2].y += 0.5*alpha*fy; + if (slabflag != 2) f[iH2].z += 0.5*alpha*fz; + } + } + } // end of parallel region +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for ik scheme and geometric mixing rule +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::fieldforce_g_ik() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + const double qqrd2e = force->qqrd2e; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? 
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + double * const * const f = thr->get_f(); + FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6()); +
+ int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx,eky,ekz; + int type; + double lj; +
+ // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { +
+ nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; +
+ compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); +
+ ekx = eky = ekz = ZEROF; + for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + ekx -= x0*vdx_brick_g[mz][my][mx]; + eky -= x0*vdy_brick_g[mz][my][mx]; + ekz -= x0*vdz_brick_g[mz][my][mx]; + } + } + } +
+ // convert E-field to force + type = atom->type[i]; + lj = B[type]; + f[i][0] += lj*ekx; + f[i][1] += lj*eky; + f[i][2] += lj*ekz; + } + } + } +} +
+/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for ad scheme and geometric mixing rule +------------------------------------------------------------------------- */ +
+void PPPMDispTIP4POMP::fieldforce_g_ad() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle +
+ const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; +
+ double *prd; +
+ if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; +
+ double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; +
+ const double hx_inv = nx_pppm_6/xprd; + const double hy_inv = ny_pppm_6/yprd; + const double hz_inv = nz_pppm_6/zprd_slab; +
+#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ?
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + double * const * const f = thr->get_f(); + FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6()); + FFT_SCALAR * const * const dr1d = static_cast<FFT_SCALAR **>(thr->get_drho1d_6()); +
+ int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx,eky,ekz; + int type; + double lj; + double sf = 0.0; + double s1,s2,s3; +
+ // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { +
+ nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; +
+ compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); + compute_drho1d_thr(dr1d,dx,dy,dz, order_6, drho_coeff_6); +
+ ekx = eky = ekz = ZEROF; + for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + ekx += dr1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick_g[mz][my][mx]; + eky += r1d[0][l]*dr1d[1][m]*r1d[2][n]*u_brick_g[mz][my][mx]; + ekz += r1d[0][l]*r1d[1][m]*dr1d[2][n]*u_brick_g[mz][my][mx]; + } + } + } + ekx *= hx_inv; + eky *= hy_inv; + ekz *= hz_inv; +
+ // convert E-field to force + type = atom->type[i]; + lj = B[type]; +
+ s1 = x[i][0]*hx_inv; + s2 = x[i][1]*hy_inv; + s3 = x[i][2]*hz_inv; +
+ sf = sf_coeff_6[0]*sin(2*MY_PI*s1); + sf += sf_coeff_6[1]*sin(4*MY_PI*s1); + sf *= 2*lj*lj; + f[i][0] += ekx*lj - sf; +
+ sf = sf_coeff_6[2]*sin(2*MY_PI*s2); + sf += sf_coeff_6[3]*sin(4*MY_PI*s2); + sf *= 2*lj*lj; + f[i][1] += eky*lj - sf; +
+ sf = sf_coeff_6[4]*sin(2*MY_PI*s3); + sf += sf_coeff_6[5]*sin(4*MY_PI*s3); + sf *= 2*lj*lj; + if (slabflag != 2) f[i][2] += ekz*lj - sf; + } + } + } +} +
+/* ---------------------------------------------------------------------- + interpolate from grid to get per-atom energy/virial for dispersion + interaction and geometric mixing rule + ------------------------------------------------------------------------- */ +
+void PPPMDispTIP4POMP::fieldforce_g_peratom() +{ + // loop over my charges, interpolate from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt +
+ const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; +
+#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ?
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6()); +
+ int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR u,v0,v1,v2,v3,v4,v5; + int type; + double lj; +
+ // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { +
+ nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; +
+ compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); +
+ u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; + for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + if (eflag_atom) u += x0*u_brick_g[mz][my][mx]; + if (vflag_atom) { + v0 += x0*v0_brick_g[mz][my][mx]; + v1 += x0*v1_brick_g[mz][my][mx]; + v2 += x0*v2_brick_g[mz][my][mx]; + v3 += x0*v3_brick_g[mz][my][mx]; + v4 += x0*v4_brick_g[mz][my][mx]; + v5 += x0*v5_brick_g[mz][my][mx]; + } + } + } + } +
+ type = atom->type[i]; + lj = B[type]*0.5; +
+ if (eflag_atom) eatom[i] += u*lj; + if (vflag_atom) { + vatom[i][0] += v0*lj; + vatom[i][1] += v1*lj; + vatom[i][2] += v2*lj; + vatom[i][3] += v3*lj; + vatom[i][4] += v4*lj; + vatom[i][5] += v5*lj; + } + } + } + } +} +
+/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for ik scheme and arithmetic mixing rule +------------------------------------------------------------------------- */ +
+void PPPMDispTIP4POMP::fieldforce_a_ik() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle +
+ const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + const double qqrd2e = force->qqrd2e; +
+#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ?
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + double * const * const f = thr->get_f(); + FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d_6()); +
+ int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2; + FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5; + FFT_SCALAR ekx6, eky6, ekz6; + int type; + double lj0,lj1,lj2,lj3,lj4,lj5,lj6; +
+ // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { +
+ nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; +
+ compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); +
+ ekx0 = eky0 = ekz0 = ZEROF; + ekx1 = eky1 = ekz1 = ZEROF; + ekx2 = eky2 = ekz2 = ZEROF; + ekx3 = eky3 = ekz3 = ZEROF; + ekx4 = eky4 = ekz4 = ZEROF; + ekx5 = eky5 = ekz5 = ZEROF; + ekx6 = eky6 = ekz6 = ZEROF; +
+ for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + ekx0 -= x0*vdx_brick_a0[mz][my][mx]; + eky0 -= x0*vdy_brick_a0[mz][my][mx]; + ekz0 -= x0*vdz_brick_a0[mz][my][mx]; + ekx1 -= x0*vdx_brick_a1[mz][my][mx]; + eky1 -= x0*vdy_brick_a1[mz][my][mx]; + ekz1 -= x0*vdz_brick_a1[mz][my][mx]; + ekx2 -= x0*vdx_brick_a2[mz][my][mx]; + eky2 -= x0*vdy_brick_a2[mz][my][mx]; + ekz2 -= x0*vdz_brick_a2[mz][my][mx]; + ekx3 -= x0*vdx_brick_a3[mz][my][mx]; + eky3 -= x0*vdy_brick_a3[mz][my][mx]; + ekz3 -= x0*vdz_brick_a3[mz][my][mx]; + ekx4 -= x0*vdx_brick_a4[mz][my][mx]; + eky4 -= x0*vdy_brick_a4[mz][my][mx]; + ekz4 -= x0*vdz_brick_a4[mz][my][mx]; + ekx5 -= x0*vdx_brick_a5[mz][my][mx]; + eky5 -= x0*vdy_brick_a5[mz][my][mx]; + ekz5 -= x0*vdz_brick_a5[mz][my][mx]; + ekx6 -= x0*vdx_brick_a6[mz][my][mx]; + eky6 -= x0*vdy_brick_a6[mz][my][mx]; + ekz6 -= x0*vdz_brick_a6[mz][my][mx]; + } + } + } +
+ // convert D-field to force + type = atom->type[i]; + lj0 = B[7*type+6]; + lj1 = B[7*type+5]; + lj2 = B[7*type+4]; + lj3 = B[7*type+3]; + lj4 = B[7*type+2]; + lj5 = B[7*type+1]; + lj6 = B[7*type]; + f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 + lj4*ekx4 + lj5*ekx5 + lj6*ekx6; + f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 + lj4*eky4 + lj5*eky5 + lj6*eky6; + f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 + lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6; + } + } + } +} +
+/* ---------------------------------------------------------------------- + interpolate from grid to get dispersion field & force on my particles + for ad scheme and arithmetic mixing rule +------------------------------------------------------------------------- */ +
+void PPPMDispTIP4POMP::fieldforce_a_ad() +{ + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle +
+ const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; +
+ double *prd; +
+ if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; +
+ double
xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + const double hx_inv = nx_pppm_6/xprd; + const double hy_inv = ny_pppm_6/yprd; + const double hz_inv = nz_pppm_6/zprd_slab; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + double * const * const f = thr->get_f(); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + FFT_SCALAR * const * const dr1d = static_cast(thr->get_drho1d_6()); + + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx0, eky0, ekz0, ekx1, eky1, ekz1, ekx2, eky2, ekz2; + FFT_SCALAR ekx3, eky3, ekz3, ekx4, eky4, ekz4, ekx5, eky5, ekz5; + FFT_SCALAR ekx6, eky6, ekz6; + int type; + double lj0,lj1,lj2,lj3,lj4,lj5,lj6; + double sf = 0.0; + double s1,s2,s3; + + // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { + + nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); + compute_drho1d_thr(dr1d,dx,dy,dz, order_6, drho_coeff_6); + + ekx0 = eky0 = ekz0 = ZEROF; + ekx1 = eky1 = ekz1 = ZEROF; + ekx2 = eky2 = ekz2 = ZEROF; + ekx3 = eky3 = ekz3 = ZEROF; + ekx4 = eky4 = ekz4 = ZEROF; + ekx5 = eky5 = ekz5 = ZEROF; + ekx6 = eky6 = ekz6 = ZEROF; + for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + x0 = dr1d[0][l]*r1d[1][m]*r1d[2][n]; + y0 = r1d[0][l]*dr1d[1][m]*r1d[2][n]; + z0 = r1d[0][l]*r1d[1][m]*dr1d[2][n]; + + ekx0 += x0*u_brick_a0[mz][my][mx]; + eky0 += y0*u_brick_a0[mz][my][mx]; + ekz0 += z0*u_brick_a0[mz][my][mx]; + + ekx1 += x0*u_brick_a1[mz][my][mx]; + eky1 += y0*u_brick_a1[mz][my][mx]; + ekz1 += z0*u_brick_a1[mz][my][mx]; + + ekx2 += x0*u_brick_a2[mz][my][mx]; + eky2 += y0*u_brick_a2[mz][my][mx]; + ekz2 += z0*u_brick_a2[mz][my][mx]; + + ekx3 += x0*u_brick_a3[mz][my][mx]; + eky3 += y0*u_brick_a3[mz][my][mx]; + ekz3 += z0*u_brick_a3[mz][my][mx]; + + ekx4 += x0*u_brick_a4[mz][my][mx]; + eky4 += y0*u_brick_a4[mz][my][mx]; + ekz4 += z0*u_brick_a4[mz][my][mx]; + + ekx5 += x0*u_brick_a5[mz][my][mx]; + eky5 += y0*u_brick_a5[mz][my][mx]; + ekz5 += z0*u_brick_a5[mz][my][mx]; + + ekx6 += x0*u_brick_a6[mz][my][mx]; + eky6 += y0*u_brick_a6[mz][my][mx]; + ekz6 += z0*u_brick_a6[mz][my][mx]; + } + } + } + + ekx0 *= hx_inv; + eky0 *= hy_inv; + ekz0 *= hz_inv; + + ekx1 *= hx_inv; + eky1 *= hy_inv; + ekz1 *= hz_inv; + + ekx2 *= hx_inv; + eky2 *= hy_inv; + ekz2 *= hz_inv; + + ekx3 *= hx_inv; + eky3 *= hy_inv; + ekz3 *= hz_inv; + + ekx4 *= hx_inv; + eky4 *= hy_inv; + ekz4 *= hz_inv; + + ekx5 *= hx_inv; + eky5 *= hy_inv; + ekz5 *= hz_inv; + + ekx6 *= hx_inv; + eky6 *= hy_inv; + ekz6 *= hz_inv; + + // convert D-field to force + type = atom->type[i]; + lj0 = B[7*type+6]; + lj1 = B[7*type+5]; + lj2 = B[7*type+4]; + lj3 = B[7*type+3]; + lj4 = B[7*type+2]; + lj5 = 
B[7*type+1]; + lj6 = B[7*type]; + + s1 = x[i][0]*hx_inv; + s2 = x[i][1]*hy_inv; + s3 = x[i][2]*hz_inv; + + sf = sf_coeff_6[0]*sin(2*MY_PI*s1); + sf += sf_coeff_6[1]*sin(4*MY_PI*s1); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + f[i][0] += lj0*ekx0 + lj1*ekx1 + lj2*ekx2 + lj3*ekx3 + lj4*ekx4 + lj5*ekx5 + lj6*ekx6 - sf; + + sf = sf_coeff_6[2]*sin(2*MY_PI*s2); + sf += sf_coeff_6[3]*sin(4*MY_PI*s2); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + f[i][1] += lj0*eky0 + lj1*eky1 + lj2*eky2 + lj3*eky3 + lj4*eky4 + lj5*eky5 + lj6*eky6 - sf; + + sf = sf_coeff_6[4]*sin(2*MY_PI*s3); + sf += sf_coeff_6[5]*sin(4*MY_PI*s3); + sf *= 4*lj0*lj6 + 4*lj1*lj5 + 4*lj2*lj4 + 2*lj3*lj3; + if (slabflag != 2) f[i][2] += lj0*ekz0 + lj1*ekz1 + lj2*ekz2 + lj3*ekz3 + lj4*ekz4 + lj5*ekz5 + lj6*ekz6 - sf; + } + } + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get per-atom energy/virial for dispersion + interaction and arithmetic mixing rule + ------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::fieldforce_a_peratom() +{ + // loop over my charges, interpolate from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + const double * const * const x = atom->x; + const int nthreads = comm->nthreads; + const int nlocal = atom->nlocal; + +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + // each thread works on a fixed chunk of atoms. + const int tid = omp_get_thread_num(); + const int inum = nlocal; + const int idelta = 1 + inum/nthreads; + const int ifrom = tid*idelta; + const int ito = ((ifrom + idelta) > inum) ? 
inum : ifrom + idelta; +#else + const int ifrom = 0; + const int ito = nlocal; + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d_6()); + + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR u0,v00,v10,v20,v30,v40,v50; + FFT_SCALAR u1,v01,v11,v21,v31,v41,v51; + FFT_SCALAR u2,v02,v12,v22,v32,v42,v52; + FFT_SCALAR u3,v03,v13,v23,v33,v43,v53; + FFT_SCALAR u4,v04,v14,v24,v34,v44,v54; + FFT_SCALAR u5,v05,v15,v25,v35,v45,v55; + FFT_SCALAR u6,v06,v16,v26,v36,v46,v56; + int type; + double lj0,lj1,lj2,lj3,lj4,lj5,lj6; + + // this if protects against having more threads than local atoms + if (ifrom < nlocal) { + for (int i = ifrom; i < ito; i++) { + + nx = part2grid_6[i][0]; + ny = part2grid_6[i][1]; + nz = part2grid_6[i][2]; + dx = nx+shiftone_6 - (x[i][0]-boxlo[0])*delxinv_6; + dy = ny+shiftone_6 - (x[i][1]-boxlo[1])*delyinv_6; + dz = nz+shiftone_6 - (x[i][2]-boxlo[2])*delzinv_6; + + compute_rho1d_thr(r1d,dx,dy,dz, order_6, rho_coeff_6); + + u0 = v00 = v10 = v20 = v30 = v40 = v50 = ZEROF; + u1 = v01 = v11 = v21 = v31 = v41 = v51 = ZEROF; + u2 = v02 = v12 = v22 = v32 = v42 = v52 = ZEROF; + u3 = v03 = v13 = v23 = v33 = v43 = v53 = ZEROF; + u4 = v04 = v14 = v24 = v34 = v44 = v54 = ZEROF; + u5 = v05 = v15 = v25 = v35 = v45 = v55 = ZEROF; + u6 = v06 = v16 = v26 = v36 = v46 = v56 = ZEROF; + for (n = nlower_6; n <= nupper_6; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower_6; m <= nupper_6; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower_6; l <= nupper_6; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + if (eflag_atom) { + u0 += x0*u_brick_a0[mz][my][mx]; + u1 += x0*u_brick_a1[mz][my][mx]; + u2 += x0*u_brick_a2[mz][my][mx]; + u3 += x0*u_brick_a3[mz][my][mx]; + u4 += x0*u_brick_a4[mz][my][mx]; + u5 += x0*u_brick_a5[mz][my][mx]; + u6 += x0*u_brick_a6[mz][my][mx]; + } + if (vflag_atom) { + v00 += x0*v0_brick_a0[mz][my][mx]; + v10 += x0*v1_brick_a0[mz][my][mx]; + v20 += x0*v2_brick_a0[mz][my][mx]; + v30 += x0*v3_brick_a0[mz][my][mx]; + v40 += x0*v4_brick_a0[mz][my][mx]; + v50 += x0*v5_brick_a0[mz][my][mx]; + v01 += x0*v0_brick_a1[mz][my][mx]; + v11 += x0*v1_brick_a1[mz][my][mx]; + v21 += x0*v2_brick_a1[mz][my][mx]; + v31 += x0*v3_brick_a1[mz][my][mx]; + v41 += x0*v4_brick_a1[mz][my][mx]; + v51 += x0*v5_brick_a1[mz][my][mx]; + v02 += x0*v0_brick_a2[mz][my][mx]; + v12 += x0*v1_brick_a2[mz][my][mx]; + v22 += x0*v2_brick_a2[mz][my][mx]; + v32 += x0*v3_brick_a2[mz][my][mx]; + v42 += x0*v4_brick_a2[mz][my][mx]; + v52 += x0*v5_brick_a2[mz][my][mx]; + v03 += x0*v0_brick_a3[mz][my][mx]; + v13 += x0*v1_brick_a3[mz][my][mx]; + v23 += x0*v2_brick_a3[mz][my][mx]; + v33 += x0*v3_brick_a3[mz][my][mx]; + v43 += x0*v4_brick_a3[mz][my][mx]; + v53 += x0*v5_brick_a3[mz][my][mx]; + v04 += x0*v0_brick_a4[mz][my][mx]; + v14 += x0*v1_brick_a4[mz][my][mx]; + v24 += x0*v2_brick_a4[mz][my][mx]; + v34 += x0*v3_brick_a4[mz][my][mx]; + v44 += x0*v4_brick_a4[mz][my][mx]; + v54 += x0*v5_brick_a4[mz][my][mx]; + v05 += x0*v0_brick_a5[mz][my][mx]; + v15 += x0*v1_brick_a5[mz][my][mx]; + v25 += x0*v2_brick_a5[mz][my][mx]; + v35 += x0*v3_brick_a5[mz][my][mx]; + v45 += x0*v4_brick_a5[mz][my][mx]; + v55 += x0*v5_brick_a5[mz][my][mx]; + v06 += x0*v0_brick_a6[mz][my][mx]; + v16 += x0*v1_brick_a6[mz][my][mx]; + v26 += x0*v2_brick_a6[mz][my][mx]; + v36 += x0*v3_brick_a6[mz][my][mx]; + v46 += x0*v4_brick_a6[mz][my][mx]; + v56 += x0*v5_brick_a6[mz][my][mx]; + } + } + } + } + + // convert D-field to force + type = atom->type[i]; + lj0 = 
B[7*type+6]*0.5; + lj1 = B[7*type+5]*0.5; + lj2 = B[7*type+4]*0.5; + lj3 = B[7*type+3]*0.5; + lj4 = B[7*type+2]*0.5; + lj5 = B[7*type+1]*0.5; + lj6 = B[7*type]*0.5; + + if (eflag_atom) + eatom[i] += u0*lj0 + u1*lj1 + u2*lj2 + + u3*lj3 + u4*lj4 + u5*lj5 + u6*lj6; + if (vflag_atom) { + vatom[i][0] += v00*lj0 + v01*lj1 + v02*lj2 + v03*lj3 + + v04*lj4 + v05*lj5 + v06*lj6; + vatom[i][1] += v10*lj0 + v11*lj1 + v12*lj2 + v13*lj3 + + v14*lj4 + v15*lj5 + v16*lj6; + vatom[i][2] += v20*lj0 + v21*lj1 + v22*lj2 + v23*lj3 + + v24*lj4 + v25*lj5 + v26*lj6; + vatom[i][3] += v30*lj0 + v31*lj1 + v32*lj2 + v33*lj3 + + v34*lj4 + v35*lj5 + v36*lj6; + vatom[i][4] += v40*lj0 + v41*lj1 + v42*lj2 + v43*lj3 + + v44*lj4 + v45*lj5 + v46*lj6; + vatom[i][5] += v50*lj0 + v51*lj1 + v52*lj2 + v53*lj3 + + v54*lj4 + v55*lj5 + v56*lj6; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + charge assignment into rho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ +void PPPMDispTIP4POMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx, + const FFT_SCALAR &dy, const FFT_SCALAR &dz, + const int ord, FFT_SCALAR * const * const rho_c) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-ord)/2; k <= ord/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = ord-1; l >= 0; l--) { + r1 = rho_c[l][k] + r1*dx; + r2 = rho_c[l][k] + r2*dy; + r3 = rho_c[l][k] + r3*dz; + } + r1d[0][k] = r1; + r1d[1][k] = r2; + r1d[2][k] = r3; + } +} + +/* ---------------------------------------------------------------------- + charge assignment into drho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::compute_drho1d_thr(FFT_SCALAR * const * const dr1d, const FFT_SCALAR &dx, + const FFT_SCALAR &dy, const FFT_SCALAR &dz, + const int ord, FFT_SCALAR * const * const drho_c) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-ord)/2; k <= ord/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = ord-2; l >= 0; l--) { + r1 = drho_c[l][k] + r1*dx; + r2 = drho_c[l][k] + r2*dy; + r3 = drho_c[l][k] + r3*dz; + } + dr1d[0][k] = r1; + dr1d[1][k] = r2; + dr1d[2][k] = r3; + } +} + +/* ---------------------------------------------------------------------- + find 2 H atoms bonded to O atom i + compute position xM of fictitious charge site for O atom + also return local indices iH1,iH2 of H atoms +------------------------------------------------------------------------- */ + +void PPPMDispTIP4POMP::find_M_thr(int i, int &iH1, int &iH2, dbl3_t &xM) +{ + iH1 = atom->map(atom->tag[i] + 1); + iH2 = atom->map(atom->tag[i] + 2); + + if (iH1 == -1 || iH2 == -1) error->one(FLERR,"TIP4P hydrogen is missing"); + if (atom->type[iH1] != typeH || atom->type[iH2] != typeH) + error->one(FLERR,"TIP4P hydrogen has incorrect atom type"); + + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + + double delx1 = x[iH1].x - x[i].x; + double dely1 = x[iH1].y - x[i].y; + double delz1 = x[iH1].z - x[i].z; + domain->minimum_image(delx1,dely1,delz1); + + double delx2 = x[iH2].x - x[i].x; + double dely2 = x[iH2].y - x[i].y; + double delz2 = x[iH2].z - x[i].z; + domain->minimum_image(delx2,dely2,delz2); + + xM.x = x[i].x + alpha * 0.5 * (delx1 + delx2); + xM.y = x[i].y + alpha * 0.5 * (dely1 + dely2); + xM.z = x[i].z + alpha * 0.5 * (delz1 + delz2); +} diff --git a/src/USER-OMP/pppm_disp_tip4p_omp.h 
b/src/USER-OMP/pppm_disp_tip4p_omp.h new file mode 100644 index 0000000000..e05a52ac80 --- /dev/null +++ b/src/USER-OMP/pppm_disp_tip4p_omp.h @@ -0,0 +1,84 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov +
+ Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. +
+ See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ +
+#ifdef KSPACE_CLASS +
+KSpaceStyle(pppm/disp/tip4p/omp,PPPMDispTIP4POMP) +
+#else +
+#ifndef LMP_PPPM_DISP_TIP4P_OMP_H +#define LMP_PPPM_DISP_TIP4P_OMP_H +
+#include "pppm_disp_tip4p.h" +#include "thr_omp.h" +
+namespace LAMMPS_NS { +
+ class PPPMDispTIP4POMP : public PPPMDispTIP4P, public ThrOMP { + public: + PPPMDispTIP4POMP(class LAMMPS *, int, char **); + virtual ~PPPMDispTIP4POMP () {}; +
+ protected: + virtual void allocate(); + virtual void deallocate(); +
+ virtual void compute_gf(); + virtual void compute_gf_6(); +
+ virtual void compute(int,int); +
+ virtual void particle_map(double, double, double, + double, int **, int, int, + int, int, int, int, int, int); + virtual void particle_map_c(double, double, double, + double, int **, int, int, + int, int, int, int, int, int); + virtual void make_rho_c(); // XXX: not (yet) multi-threaded + virtual void make_rho_g(); + virtual void make_rho_a(); +
+ virtual void fieldforce_c_ik(); + virtual void fieldforce_c_ad(); + // virtual void fieldforce_peratom(); XXX: need to benchmark first. + virtual void fieldforce_g_ik(); + virtual void fieldforce_g_ad(); + virtual void fieldforce_g_peratom(); + virtual void fieldforce_a_ik(); + virtual void fieldforce_a_ad(); + virtual void fieldforce_a_peratom(); +
+ private: + void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &, + const FFT_SCALAR &, const FFT_SCALAR &, + const int, FFT_SCALAR * const * const); + void compute_drho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &, + const FFT_SCALAR &, const FFT_SCALAR &, + const int, FFT_SCALAR * const * const); + virtual void find_M_thr(int, int &, int &, dbl3_t &); +
+}; +
+} +
+#endif +#endif +
+/* ERROR/WARNING messages: +
+E: Kspace style pppm/disp/tip4p/omp requires newton on +
+Self-explanatory. +
+*/
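
For reference, a minimal standalone sketch (not part of the patch) of the static per-thread atom partitioning used in the dispersion fieldforce_g_* / fieldforce_a_* routines above: each thread takes one contiguous chunk of the nlocal atoms and the last chunk is clamped. The atom and thread counts below are made-up example values; only the idelta/ifrom/ito arithmetic mirrors the code.

// illustrative only: mirrors the chunking arithmetic of the parallel regions above
#include <cstdio>

int main()
{
  const int nlocal = 10;     // example: number of atoms owned by this MPI rank
  const int nthreads = 4;    // example: number of OpenMP threads

  for (int tid = 0; tid < nthreads; ++tid) {
    const int idelta = 1 + nlocal/nthreads;                                  // chunk size per thread
    const int ifrom  = tid*idelta;                                           // first atom of this chunk
    const int ito    = (ifrom + idelta > nlocal) ? nlocal : ifrom + idelta;  // clamped end of chunk
    // a thread whose chunk starts at or beyond nlocal does no work
    // (cf. the "if (ifrom < nlocal)" guard in the routines above)
    printf("thread %d handles atoms [%d,%d)\n", tid, ifrom, ito);
  }
  return 0;
}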
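
Likewise, a hedged sketch (also not part of the patch) of the TIP4P force redistribution performed in fieldforce_c_ik() and fieldforce_c_ad(): the electric field is evaluated at the massless M site located by find_M_thr(), and the resulting force is split back onto the real atoms, a (1 - alpha) fraction to the oxygen and alpha/2 to each hydrogen. The Vec3 type and the helper name are hypothetical, and the slabflag == 2 special case for the z component is omitted.

// illustrative only: the O/H split applied to a force computed at the TIP4P M site
struct Vec3 { double x, y, z; };

void redistribute_msite_force(const Vec3 &fM, double alpha,
                              Vec3 &fO, Vec3 &fH1, Vec3 &fH2)
{
  // the oxygen keeps the (1 - alpha) fraction of the M-site force
  fO.x  += (1.0 - alpha)*fM.x;
  fO.y  += (1.0 - alpha)*fM.y;
  fO.z  += (1.0 - alpha)*fM.z;

  // each hydrogen receives half of the remaining alpha fraction
  fH1.x += 0.5*alpha*fM.x;
  fH1.y += 0.5*alpha*fM.y;
  fH1.z += 0.5*alpha*fM.z;

  fH2.x += 0.5*alpha*fM.x;
  fH2.y += 0.5*alpha*fM.y;
  fH2.z += 0.5*alpha*fM.z;
}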