diff --git a/src/GPU/gpu_extra.h b/src/GPU/gpu_extra.h new file mode 100644 index 0000000000..30faef139d --- /dev/null +++ b/src/GPU/gpu_extra.h @@ -0,0 +1,47 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Mike Brown (ORNL) +------------------------------------------------------------------------- */ + +#ifndef LMP_GPU_EXTRA_H +#define LMP_GPU_EXTRA_H + +#include "error.h" + +namespace GPU_EXTRA { + + inline void check_flag(int error_flag, LAMMPS_NS::Error *error, + MPI_Comm &world) { + int all_success; + MPI_Allreduce(&error_flag, &all_success, 1, MPI_INT, MPI_MIN, world); + if (all_success != 0) { + if (all_success == -1) + error->all("Accelerated style in input script but no fix gpu."); + else if (all_success == -2) + error->all("Could not find/initialize a specified accelerator device."); + else if (all_success == -3) + error->all("Insufficient memory on accelerator."); + else if (all_success == -4) + error->all("GPU library not compiled for this accelerator."); + else if (all_success == -5) + error->all("Double precision is not supported on this accelerator."); + else + error->all("Unknown error in GPU library."); + } + } + +} + +#endif diff --git a/src/GPU/pair_lj_expand_gpu.cpp b/src/GPU/pair_lj_expand_gpu.cpp new file mode 100644 index 0000000000..b4c50d55c4 --- /dev/null +++ b/src/GPU/pair_lj_expand_gpu.cpp @@ -0,0 +1,236 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com +------------------------------------------------------------------------- */ + +#include "math.h" +#include "stdio.h" +#include "stdlib.h" +#include "pair_lj_expand_gpu.h" +#include "lmptype.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "integrate.h" +#include "memory.h" +#include "error.h" +#include "neigh_request.h" +#include "universe.h" +#include "update.h" +#include "domain.h" +#include "string.h" +#include "gpu_extra.h" + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + +// External functions from cuda library for atom decomposition + +int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen); +void lje_gpu_clear(); +int ** lje_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void lje_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); +double lje_gpu_bytes(); + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairLJExpandGPU::PairLJExpandGPU(LAMMPS *lmp) : PairLJExpand(lmp), gpu_mode(GPU_PAIR) +{ + respa_enable = 0; + cpu_time = 0.0; +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairLJExpandGPU::~PairLJExpandGPU() +{ + lje_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJExpandGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode == GPU_NEIGH) { + inum = atom->nlocal; + firstneigh = lje_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + lje_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); + } + if (!success) + error->one("Out of memory on GPGPU"); + + if (host_startnewton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + int success = lje_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, shift, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + 
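The init return code above is funneled through GPU_EXTRA::check_flag(), which reduces every rank's flag with MPI_MIN so all processes agree on the most severe accelerator error before aborting. A minimal standalone sketch of that pattern follows; report_gpu_error() is a hypothetical stand-in for LAMMPS' Error::all(), and the flag values mirror the codes listed in gpu_extra.h.

// Standalone sketch (not part of the patch) of the error-aggregation pattern
// in GPU_EXTRA::check_flag(): each rank contributes its local init flag, the
// MPI_MIN reduction picks the most negative (most severe) code, and every
// rank then takes the same abort decision.
#include <mpi.h>
#include <cstdio>

static void report_gpu_error(int code, MPI_Comm comm)   // hypothetical helper
{
  const char *msg;
  if      (code == -1) msg = "Accelerated style in input script but no fix gpu.";
  else if (code == -2) msg = "Could not find/initialize a specified accelerator device.";
  else if (code == -3) msg = "Insufficient memory on accelerator.";
  else if (code == -4) msg = "GPU library not compiled for this accelerator.";
  else if (code == -5) msg = "Double precision is not supported on this accelerator.";
  else                 msg = "Unknown error in GPU library.";
  fprintf(stderr,"%s\n",msg);
  MPI_Abort(comm,1);
}

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);
  MPI_Comm world = MPI_COMM_WORLD;

  int local_flag = 0;               // 0 = success; a failing rank would set -1..-5
  int all_flag;
  MPI_Allreduce(&local_flag,&all_flag,1,MPI_INT,MPI_MIN,world);

  if (all_flag != 0) report_gpu_error(all_flag,world);   // collective decision
  MPI_Finalize();
  return 0;
}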
neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairLJExpandGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + lje_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJExpandGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double rsq,r2inv,r6inv,forcelj,factor_lj; + double r,rshift,rshiftsq; + int *jlist; + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + double *special_lj = force->special_lj; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r = sqrt(rsq); + rshift = r - shift[itype][jtype]; + rshiftsq = rshift*rshift; + r2inv = 1.0/rshiftsq; + r6inv = r2inv*r2inv*r2inv; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + fpair = factor_lj*forcelj/rshift/r; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) - + offset[itype][jtype]; + evdwl *= factor_lj; + } + + if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); + } + } + } +} diff --git a/src/GPU/pair_lj_expand_gpu.h b/src/GPU/pair_lj_expand_gpu.h new file mode 100644 index 0000000000..96619034c7 --- /dev/null +++ b/src/GPU/pair_lj_expand_gpu.h @@ -0,0 +1,47 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
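The cpu_compute() loop above evaluates an ordinary 12-6 Lennard-Jones interaction at the shifted separation r - delta. The standalone sketch below applies the same arithmetic to a single pair; the lj1..lj4 prefactors follow the usual LAMMPS precomputation (48*eps*sigma^12, 24*eps*sigma^6, 4*eps*sigma^12, 4*eps*sigma^6), and epsilon, sigma, delta, r are illustrative values only.

// Standalone sketch of the lj/expand kernel for one pair, mirroring the
// cpu_compute() loop: the potential is evaluated at the shifted distance
// rshift = r - delta, and fpair is force/r as used for delx,dely,delz.
#include <cmath>
#include <cstdio>

int main()
{
  double epsilon = 1.0, sigma = 1.0, delta = 0.5;   // illustrative coefficients
  double lj1 = 48.0*epsilon*pow(sigma,12.0);
  double lj2 = 24.0*epsilon*pow(sigma,6.0);
  double lj3 =  4.0*epsilon*pow(sigma,12.0);
  double lj4 =  4.0*epsilon*pow(sigma,6.0);

  double r = 1.7;                     // pair distance
  double rshift = r - delta;          // shifted distance used by lj/expand
  double r2inv = 1.0/(rshift*rshift);
  double r6inv = r2inv*r2inv*r2inv;

  double forcelj = r6inv*(lj1*r6inv - lj2);
  double fpair   = forcelj/rshift/r;  // force/r
  double evdwl   = r6inv*(lj3*r6inv - lj4);   // pair energy (no offset)

  printf("r = %g  f/r = %g  E = %g\n",r,fpair,evdwl);
  return 0;
}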
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/expand/gpu,PairLJExpandGPU) + +#else + +#ifndef LMP_PAIR_LJE_LIGHT_GPU_H +#define LMP_PAIR_LJE_LIGHT_GPU_H + +#include "pair_lj_expand.h" + +namespace LAMMPS_NS { + +class PairLJExpandGPU : public PairLJExpand { + public: + PairLJExpandGPU(LAMMPS *lmp); + ~PairLJExpandGPU(); + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int); + void init_style(); + double memory_usage(); + + enum { GPU_PAIR, GPU_NEIGH }; + + private: + int gpu_mode; + double cpu_time; + int *gpulist; +}; + +} +#endif +#endif + diff --git a/src/GPU/pair_morse_gpu.cpp b/src/GPU/pair_morse_gpu.cpp new file mode 100644 index 0000000000..711359a300 --- /dev/null +++ b/src/GPU/pair_morse_gpu.cpp @@ -0,0 +1,230 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Mike Brown (SNL) +------------------------------------------------------------------------- */ + +#include "math.h" +#include "stdio.h" +#include "stdlib.h" +#include "pair_morse_gpu.h" +#include "lmptype.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "integrate.h" +#include "memory.h" +#include "error.h" +#include "neigh_request.h" +#include "universe.h" +#include "update.h" +#include "domain.h" +#include "string.h" +#include "gpu_extra.h" + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + +// External functions from cuda library for atom decomposition + +int mor_gpu_init(const int ntypes, double **cutsq, double **host_morse1, + double **host_r0, double **host_alpha, double **host_d0, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void mor_gpu_clear(); +int ** mor_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void mor_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); +double mor_gpu_bytes(); + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairMorseGPU::PairMorseGPU(LAMMPS *lmp) : PairMorse(lmp), gpu_mode(GPU_PAIR) +{ + cpu_time = 0.0; +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairMorseGPU::~PairMorseGPU() +{ + mor_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairMorseGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode == GPU_NEIGH) { + inum = atom->nlocal; + firstneigh = mor_gpu_compute_n(neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + mor_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); + } + if (!success) + error->one("Out of memory on GPGPU"); + + if (host_startnewton_pair) + error->all("Cannot use newton pair with GPU Morse pair style"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + int success = mor_gpu_init(atom->ntypes+1, cutsq, morse1, r0, alpha, d0, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + 
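The host fallback for this style (cpu_compute(), below) uses the standard Morse kernel with morse1 = 2*D0*alpha precomputed by the base PairMorse class. A standalone sketch of that kernel for one pair, with illustrative parameter values:

// Standalone sketch of the Morse pair kernel:
//   E(r) = D0*(exp(-2*alpha*(r-r0)) - 2*exp(-alpha*(r-r0)))
// with morse1 = 2*D0*alpha, so force/r = morse1*(dexp*dexp - dexp)/r.
#include <cmath>
#include <cstdio>

int main()
{
  double d0 = 1.0, alpha = 2.0, r0 = 1.1;   // illustrative Morse parameters
  double morse1 = 2.0*d0*alpha;

  double r = 1.5;
  double dr = r - r0;
  double dexp = exp(-alpha*dr);

  double fpair = morse1*(dexp*dexp - dexp)/r;   // force/r
  double evdwl = d0*(dexp*dexp - 2.0*dexp);     // pair energy (no offset)

  printf("r = %g  f/r = %g  E = %g\n",r,fpair,evdwl);
  return 0;
}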
} +} + +/* ---------------------------------------------------------------------- */ + +double PairMorseGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + mor_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairMorseGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double rsq,r,dr,dexp,factor_lj; + int *jlist; + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + double *special_lj = force->special_lj; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r = sqrt(rsq); + dr = r - r0[itype][jtype]; + dexp = exp(-alpha[itype][jtype] * dr); + fpair = factor_lj * morse1[itype][jtype] * (dexp*dexp - dexp) / r; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + evdwl = d0[itype][jtype] * (dexp*dexp - 2.0*dexp) - + offset[itype][jtype]; + evdwl *= factor_lj; + } + + if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); + } + } + } +} diff --git a/src/GPU/pair_morse_gpu.h b/src/GPU/pair_morse_gpu.h new file mode 100644 index 0000000000..ee9ba576d5 --- /dev/null +++ b/src/GPU/pair_morse_gpu.h @@ -0,0 +1,47 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(morse/gpu,PairMorseGPU) + +#else + +#ifndef LMP_PAIR_MORSE_GPU_H +#define LMP_PAIR_MORSE_GPU_H + +#include "pair_morse.h" + +namespace LAMMPS_NS { + +class PairMorseGPU : public PairMorse { + public: + PairMorseGPU(LAMMPS *lmp); + ~PairMorseGPU(); + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int); + void init_style(); + double memory_usage(); + + enum { GPU_PAIR, GPU_NEIGH }; + + private: + int gpu_mode; + double cpu_time; + int *gpulist; +}; + +} +#endif +#endif + diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp new file mode 100644 index 0000000000..d447419ce7 --- /dev/null +++ b/src/GPU/pppm_gpu.cpp @@ -0,0 +1,1743 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "string.h" +#include "stdio.h" +#include "stdlib.h" +#include "math.h" +#include "pppm_gpu.h" +#include "lmptype.h" +#include "atom.h" +#include "comm.h" +#include "neighbor.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "domain.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define MAXORDER 7 +#define OFFSET 16384 +#define SMALL 0.00001 +#define LARGE 10000.0 +#define EPS_HOC 1.0e-7 + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +#define PPPMGPUT PPPMGPU + +/* ---------------------------------------------------------------------- */ + +template +PPPMGPUT::PPPMGPU(LAMMPS *lmp, int narg, char **arg) : KSpace(lmp, narg, arg) +{ + if (narg != 1) error->all("Illegal kspace_style pppm command"); + + precision = atof(arg[0]); + PI = 4.0*atan(1.0); + + nfactors = 3; + factors = new int[nfactors]; + factors[0] = 2; + factors[1] = 3; + factors[2] = 5; + + MPI_Comm_rank(world,&me); + MPI_Comm_size(world,&nprocs); + + density_brick = NULL; + vd_brick = NULL; + density_fft = NULL; + greensfn = NULL; + work1 = work2 = NULL; + vg = NULL; + fkx = fky = fkz = NULL; + buf1 = buf2 = NULL; + + gf_b = NULL; + rho1d = rho_coeff = NULL; + + fft1 = fft2 = NULL; + remap = NULL; + + nmax = 0; +} + +/* ---------------------------------------------------------------------- + free all memory +------------------------------------------------------------------------- */ + +template +PPPMGPUT::~PPPMGPU() +{ + delete [] factors; + deallocate(); +} + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::base_init() +{ + if (me == 0) { + if (screen) fprintf(screen,"PPPM initialization ...\n"); + if (logfile) fprintf(logfile,"PPPM initialization ...\n"); + } + + // error check + + if (domain->triclinic) + error->all("Cannot (yet) use PPPM with triclinic box"); + if (domain->dimension == 2) error->all("Cannot use PPPM with 2d simulation"); + + if (!atom->q_flag) error->all("Kspace style requires atom attribute q"); + + if (slabflag == 0 && domain->nonperiodic > 0) + error->all("Cannot use nonperiodic boundaries with PPPM"); + if (slabflag == 1) { + if (domain->xperiodic != 1 || domain->yperiodic != 1 || + domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) + error->all("Incorrect boundaries with slab PPPM"); + } + + if (order > MAXORDER) { + char str[128]; + sprintf(str,"PPPM order cannot be greater than %d",MAXORDER); + error->all(str); + } + + // free all arrays previously allocated + + deallocate(); + + // extract short-range Coulombic cutoff from pair style + + qqrd2e = force->qqrd2e; + scale = 1.0; + + if (force->pair == NULL) + error->all("KSpace style is incompatible with Pair style"); + int itmp; + double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); + if (p_cutoff == NULL) + error->all("KSpace style is 
incompatible with Pair style"); + cutoff = *p_cutoff; + + // if kspace is TIP4P, extract TIP4P params from pair style + // bond/angle are not yet init(), so insure equilibrium request is valid + + qdist = 0.0; + + if (strcmp(force->kspace_style,"pppm/tip4p") == 0) { + if (force->pair == NULL) + error->all("KSpace style is incompatible with Pair style"); + double *p_qdist = (double *) force->pair->extract("qdist",itmp); + int *p_typeO = (int *) force->pair->extract("typeO",itmp); + int *p_typeH = (int *) force->pair->extract("typeH",itmp); + int *p_typeA = (int *) force->pair->extract("typeA",itmp); + int *p_typeB = (int *) force->pair->extract("typeB",itmp); + if (!p_qdist || !p_typeO || !p_typeH || !p_typeA || !p_typeB) + error->all("KSpace style is incompatible with Pair style"); + qdist = *p_qdist; + typeO = *p_typeO; + typeH = *p_typeH; + int typeA = *p_typeA; + int typeB = *p_typeB; + + if (force->angle == NULL || force->bond == NULL) + error->all("Bond and angle potentials must be defined for TIP4P"); + if (typeA < 1 || typeA > atom->nangletypes || + force->angle->setflag[typeA] == 0) + error->all("Bad TIP4P angle type for PPPM/TIP4P"); + if (typeB < 1 || typeA > atom->nbondtypes || + force->bond->setflag[typeB] == 0) + error->all("Bad TIP4P bond type for PPPM/TIP4P"); + double theta = force->angle->equilibrium_angle(typeA); + double blen = force->bond->equilibrium_distance(typeB); + alpha = qdist / (2.0 * cos(0.5*theta) * blen); + } + + // compute qsum & qsqsum and warn if not charge-neutral + + qsum = qsqsum = 0.0; + for (int i = 0; i < atom->nlocal; i++) { + qsum += atom->q[i]; + qsqsum += atom->q[i]*atom->q[i]; + } + + double tmp; + MPI_Allreduce(&qsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + qsum = tmp; + MPI_Allreduce(&qsqsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + qsqsum = tmp; + + if (qsqsum == 0.0) + error->all("Cannot use kspace solver on system with no charge"); + if (fabs(qsum) > SMALL && me == 0) { + char str[128]; + sprintf(str,"System is not charge neutral, net charge = %g",qsum); + error->warning(str); + } + + // setup FFT grid resolution and g_ewald + // normally one iteration thru while loop is all that is required + // if grid stencil extends beyond neighbor proc, reduce order and try again + + int iteration = 0; + + while (order > 0) { + + if (iteration && me == 0) + error->warning("Reducing PPPM order b/c stencil extends " + "beyond neighbor processor"); + iteration++; + + set_grid(); + + if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET) + error->all("PPPM grid is too large"); + + // global indices of PPPM grid range from 0 to N-1 + // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of + // global PPPM grid that I own without ghost cells + // for slab PPPM, assign z grid as if it were not extended + + nxlo_in = comm->myloc[0]*nx_pppm / comm->procgrid[0]; + nxhi_in = (comm->myloc[0]+1)*nx_pppm / comm->procgrid[0] - 1; + nylo_in = comm->myloc[1]*ny_pppm / comm->procgrid[1]; + nyhi_in = (comm->myloc[1]+1)*ny_pppm / comm->procgrid[1] - 1; + nzlo_in = comm->myloc[2] * + (static_cast (nz_pppm/slab_volfactor)) / comm->procgrid[2]; + nzhi_in = (comm->myloc[2]+1) * + (static_cast (nz_pppm/slab_volfactor)) / comm->procgrid[2] - 1; + + // nlower,nupper = stencil size for mapping particles to PPPM grid + + nlower = -(order-1)/2; + nupper = order/2; + + // shift values for particle <-> grid mapping + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + if (order % 2) shift = OFFSET + 0.5; + else shift = OFFSET; + if (order % 2) shiftone = 0.0; 
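Each processor's owned sub-brick of the global PPPM grid is carved out with plain integer arithmetic on its location in the processor grid, and the interpolation stencil spans nlower..nupper around the nearest grid point. A standalone sketch of that bookkeeping for one dimension, with illustrative nx_pppm, procgrid and order values:

// Standalone sketch of the sub-brick assignment and stencil bounds used in
// base_init(); all numbers below are illustrative, not from a real run.
#include <cstdio>

int main()
{
  int nx_pppm = 18, procgrid = 4, order = 5;

  for (int myloc = 0; myloc < procgrid; myloc++) {
    int nxlo_in = myloc*nx_pppm / procgrid;           // first owned grid plane
    int nxhi_in = (myloc+1)*nx_pppm / procgrid - 1;   // last owned grid plane
    printf("proc %d owns x planes %d..%d\n",myloc,nxlo_in,nxhi_in);
  }

  int nlower = -(order-1)/2;   // stencil extent below the nearest grid point
  int nupper = order/2;        // stencil extent above
  printf("order %d stencil: %d..%d\n",order,nlower,nupper);
  return 0;
}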
+ else shiftone = 0.5; + + // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of + // global PPPM grid that my particles can contribute charge to + // effectively nlo_in,nhi_in + ghost cells + // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest + // position a particle in my box can be at + // dist[3] = particle position bound = subbox + skin/2.0 + qdist + // qdist = offset due to TIP4P fictitious charge + // convert to triclinic if necessary + // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping + // for slab PPPM, assign z grid as if it were not extended + + triclinic = domain->triclinic; + double *prd,*sublo,*subhi; + + if (triclinic == 0) { + prd = domain->prd; + boxlo = domain->boxlo; + sublo = domain->sublo; + subhi = domain->subhi; + } else { + prd = domain->prd_lamda; + boxlo = domain->boxlo_lamda; + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + double dist[3]; + double cuthalf = 0.5*neighbor->skin + qdist; + if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf; + else { + dist[0] = cuthalf/domain->prd[0]; + dist[1] = cuthalf/domain->prd[1]; + dist[2] = cuthalf/domain->prd[2]; + } + + int nlo,nhi; + + nlo = static_cast ((sublo[0]-dist[0]-boxlo[0]) * + nx_pppm/xprd + shift) - OFFSET; + nhi = static_cast ((subhi[0]+dist[0]-boxlo[0]) * + nx_pppm/xprd + shift) - OFFSET; + nxlo_out = nlo + nlower; + nxhi_out = nhi + nupper; + + nlo = static_cast ((sublo[1]-dist[1]-boxlo[1]) * + ny_pppm/yprd + shift) - OFFSET; + nhi = static_cast ((subhi[1]+dist[1]-boxlo[1]) * + ny_pppm/yprd + shift) - OFFSET; + nylo_out = nlo + nlower; + nyhi_out = nhi + nupper; + + nlo = static_cast ((sublo[2]-dist[2]-boxlo[2]) * + nz_pppm/zprd_slab + shift) - OFFSET; + nhi = static_cast ((subhi[2]+dist[2]-boxlo[2]) * + nz_pppm/zprd_slab + shift) - OFFSET; + nzlo_out = nlo + nlower; + nzhi_out = nhi + nupper; + + // for slab PPPM, change the grid boundary for processors at +z end + // to include the empty volume between periodically repeating slabs + // for slab PPPM, want charge data communicated from -z proc to +z proc, + // but not vice versa, also want field data communicated from +z proc to + // -z proc, but not vice versa + // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells) + + if (slabflag && ((comm->myloc[2]+1) == (comm->procgrid[2]))) { + nzhi_in = nz_pppm - 1; + nzhi_out = nz_pppm - 1; + } + + // nlo_ghost,nhi_ghost = # of planes I will recv from 6 directions + // that overlay domain I own + // proc in that direction tells me via sendrecv() + // if no neighbor proc, value is from self since I have ghosts regardless + + int nplanes; + MPI_Status status; + + nplanes = nxlo_in - nxlo_out; + if (comm->procneigh[0][0] != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][0],0, + &nxhi_ghost,1,MPI_INT,comm->procneigh[0][1],0, + world,&status); + else nxhi_ghost = nplanes; + + nplanes = nxhi_out - nxhi_in; + if (comm->procneigh[0][1] != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][1],0, + &nxlo_ghost,1,MPI_INT,comm->procneigh[0][0], + 0,world,&status); + else nxlo_ghost = nplanes; + + nplanes = nylo_in - nylo_out; + if (comm->procneigh[1][0] != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][0],0, + &nyhi_ghost,1,MPI_INT,comm->procneigh[1][1],0, + world,&status); + else nyhi_ghost = nplanes; + + nplanes = nyhi_out - nyhi_in; + if (comm->procneigh[1][1] != me) + 
MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][1],0, + &nylo_ghost,1,MPI_INT,comm->procneigh[1][0],0, + world,&status); + else nylo_ghost = nplanes; + + nplanes = nzlo_in - nzlo_out; + if (comm->procneigh[2][0] != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][0],0, + &nzhi_ghost,1,MPI_INT,comm->procneigh[2][1],0, + world,&status); + else nzhi_ghost = nplanes; + + nplanes = nzhi_out - nzhi_in; + if (comm->procneigh[2][1] != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][1],0, + &nzlo_ghost,1,MPI_INT,comm->procneigh[2][0],0, + world,&status); + else nzlo_ghost = nplanes; + + // test that ghost overlap is not bigger than my sub-domain + + int flag = 0; + if (nxlo_ghost > nxhi_in-nxlo_in+1) flag = 1; + if (nxhi_ghost > nxhi_in-nxlo_in+1) flag = 1; + if (nylo_ghost > nyhi_in-nylo_in+1) flag = 1; + if (nyhi_ghost > nyhi_in-nylo_in+1) flag = 1; + if (nzlo_ghost > nzhi_in-nzlo_in+1) flag = 1; + if (nzhi_ghost > nzhi_in-nzlo_in+1) flag = 1; + + int flag_all; + MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world); + + if (flag_all == 0) break; + order--; + } + + if (order == 0) error->all("PPPM order has been reduced to 0"); + + // decomposition of FFT mesh + // global indices range from 0 to N-1 + // proc owns entire x-dimension, clump of columns in y,z dimensions + // npey_fft,npez_fft = # of procs in y,z dims + // if nprocs is small enough, proc can own 1 or more entire xy planes, + // else proc owns 2d sub-blocks of yz plane + // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions + // nlo_fft,nhi_fft = lower/upper limit of the section + // of the global FFT mesh that I own + + int npey_fft,npez_fft; + if (nz_pppm >= nprocs) { + npey_fft = 1; + npez_fft = nprocs; + } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); + + int me_y = me % npey_fft; + int me_z = me / npey_fft; + + nxlo_fft = 0; + nxhi_fft = nx_pppm - 1; + nylo_fft = me_y*ny_pppm/npey_fft; + nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1; + nzlo_fft = me_z*nz_pppm/npez_fft; + nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1; + + // PPPM grid for this proc, including ghosts + + ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + + // FFT arrays on this proc, without ghosts + // nfft = FFT points in FFT decomposition on this proc + // nfft_brick = FFT points in 3d brick-decomposition on this proc + // nfft_both = greater of 2 values + + nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) * + (nzhi_fft-nzlo_fft+1); + int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) * + (nzhi_in-nzlo_in+1); + nfft_both = MAX(nfft,nfft_brick); + + // buffer space for use in brick2fft and fillbrick + // idel = max # of ghost planes to send or recv in +/- dir of each dim + // nx,ny,nz = owned planes (including ghosts) in each dim + // nxx,nyy,nzz = max # of grid cells to send in each dim + // nbuf = max in any dim, augment by 3x for components of vd_xyz in fillbrick + + int idelx,idely,idelz,nx,ny,nz,nxx,nyy,nzz; + + idelx = MAX(nxlo_ghost,nxhi_ghost); + idelx = MAX(idelx,nxhi_out-nxhi_in); + idelx = MAX(idelx,nxlo_in-nxlo_out); + + idely = MAX(nylo_ghost,nyhi_ghost); + idely = MAX(idely,nyhi_out-nyhi_in); + idely = MAX(idely,nylo_in-nylo_out); + + idelz = MAX(nzlo_ghost,nzhi_ghost); + idelz = MAX(idelz,nzhi_out-nzhi_in); + idelz = MAX(idelz,nzlo_in-nzlo_out); + + nx = nxhi_out - nxlo_out + 1; + ny = nyhi_out - nylo_out + 1; + nz = nzhi_out - nzlo_out + 1; + + nxx = idelx * ny * nz; + nyy = idely * nx * nz; + nzz = idelz * nx * ny; + + nbuf = MAX(nxx,nyy); + nbuf = MAX(nbuf,nzz); + 
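For the FFTs the mesh is re-decomposed into pencils: every processor keeps the full x extent and a contiguous block of y/z columns given by its position in an npey_fft x npez_fft grid. The sketch below reproduces that index arithmetic; the simple factorization standing in for procs2grid2d() is a placeholder, not the real routine.

// Standalone sketch of the FFT-mesh decomposition: proc "me" sits at
// (me_y, me_z) in an npey_fft x npez_fft grid and owns the y/z ranges below.
#include <cstdio>

int main()
{
  int nprocs = 6, nx_pppm = 16, ny_pppm = 16, nz_pppm = 16;

  int npey_fft, npez_fft;
  if (nz_pppm >= nprocs) { npey_fft = 1; npez_fft = nprocs; }
  else { npey_fft = 2; npez_fft = nprocs/2; }      // placeholder for procs2grid2d()

  for (int me = 0; me < nprocs; me++) {
    int me_y = me % npey_fft;
    int me_z = me / npey_fft;
    int nylo_fft = me_y*ny_pppm/npey_fft;
    int nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
    int nzlo_fft = me_z*nz_pppm/npez_fft;
    int nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
    int nfft = nx_pppm*(nyhi_fft-nylo_fft+1)*(nzhi_fft-nzlo_fft+1);
    printf("proc %d: x 0..%d  y %d..%d  z %d..%d  (%d FFT points)\n",
           me,nx_pppm-1,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,nfft);
  }
  return 0;
}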
nbuf *= 3; + + // print stats + + int ngrid_max,nfft_both_max,nbuf_max; + MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world); + MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world); + MPI_Allreduce(&nbuf,&nbuf_max,1,MPI_INT,MPI_MAX,world); + + if (me == 0) { + if (screen) fprintf(screen," brick FFT buffer size/proc = %d %d %d\n", + ngrid_max,nfft_both_max,nbuf_max); + if (logfile) fprintf(logfile," brick FFT buffer size/proc = %d %d %d\n", + ngrid_max,nfft_both_max,nbuf_max); + } + + // allocate K-space dependent memory + + allocate(); + + // pre-compute Green's function denomiator expansion + // pre-compute 1d charge distribution coefficients + + compute_gf_denom(); + compute_rho_coeff(); +} + +/* ---------------------------------------------------------------------- + adjust PPPM coeffs, called initially and whenever volume has changed +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::setup() +{ + int i,j,k,l,m,n; + double *prd; + + // volume-dependent factors + // adjust z dimension for 2d slab PPPM + // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + delxinv = nx_pppm/xprd; + delyinv = ny_pppm/yprd; + delzinv = nz_pppm/zprd_slab; + + delvolinv = delxinv*delyinv*delzinv; + + double unitkx = (2.0*PI/xprd); + double unitky = (2.0*PI/yprd); + double unitkz = (2.0*PI/zprd_slab); + + // fkx,fky,fkz for my FFT grid pts + + double per; + + for (i = nxlo_fft; i <= nxhi_fft; i++) { + per = i - nx_pppm*(2*i/nx_pppm); + fkx[i] = unitkx*per; + } + + for (i = nylo_fft; i <= nyhi_fft; i++) { + per = i - ny_pppm*(2*i/ny_pppm); + fky[i] = unitky*per; + } + + for (i = nzlo_fft; i <= nzhi_fft; i++) { + per = i - nz_pppm*(2*i/nz_pppm); + fkz[i] = unitkz*per; + } + + // virial coefficients + + double sqk,vterm; + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) { + for (j = nylo_fft; j <= nyhi_fft; j++) { + for (i = nxlo_fft; i <= nxhi_fft; i++) { + sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; + if (sqk == 0.0) { + vg[n][0] = 0.0; + vg[n][1] = 0.0; + vg[n][2] = 0.0; + vg[n][3] = 0.0; + vg[n][4] = 0.0; + vg[n][5] = 0.0; + } else { + vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); + vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; + vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; + vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; + vg[n][3] = vterm*fkx[i]*fky[j]; + vg[n][4] = vterm*fkx[i]*fkz[k]; + vg[n][5] = vterm*fky[j]*fkz[k]; + } + n++; + } + } + } + + // modified (Hockney-Eastwood) Coulomb Green's function + + int nx,ny,nz,kper,lper,mper; + double snx,sny,snz,snx2,sny2,snz2; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,dot1,dot2; + double numerator,denominator; + + int nbx = static_cast ((g_ewald*xprd/(PI*nx_pppm)) * + pow(-log(EPS_HOC),0.25)); + int nby = static_cast ((g_ewald*yprd/(PI*ny_pppm)) * + pow(-log(EPS_HOC),0.25)); + int nbz = static_cast ((g_ewald*zprd_slab/(PI*nz_pppm)) * + pow(-log(EPS_HOC),0.25)); + + double form = 1.0; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + snz = sin(0.5*unitkz*mper*zprd_slab/nz_pppm); + snz2 = snz*snz; + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + sny = sin(0.5*unitky*lper*yprd/ny_pppm); + sny2 = sny*sny; + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - 
nx_pppm*(2*k/nx_pppm); + snx = sin(0.5*unitkx*kper*xprd/nx_pppm); + snx2 = snx*snx; + + sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + + pow(unitkz*mper,2.0); + + if (sqk != 0.0) { + numerator = form*12.5663706/sqk; + denominator = gf_denom(snx2,sny2,snz2); + sum1 = 0.0; + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-.25*pow(qx/g_ewald,2.0)); + wx = 1.0; + argx = 0.5*qx*xprd/nx_pppm; + if (argx != 0.0) wx = pow(sin(argx)/argx,order); + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-.25*pow(qy/g_ewald,2.0)); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm; + if (argy != 0.0) wy = pow(sin(argy)/argy,order); + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-.25*pow(qz/g_ewald,2.0)); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm; + if (argz != 0.0) wz = pow(sin(argz)/argz,order); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * pow(wx*wy*wz,2.0); + } + } + } + greensfn[n++] = numerator*sum1/denominator; + } else greensfn[n++] = 0.0; + } + } + } +} + +/* ---------------------------------------------------------------------- + allocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::allocate() +{ + memory->create(density_fft,nfft_both,"pppm:density_fft"); + memory->create(greensfn,nfft_both,"pppm:greensfn"); + memory->create(work1,2*nfft_both,"pppm:work1"); + memory->create(work2,2*nfft_both,"pppm:work2"); + memory->create(vg,nfft_both,6,"pppm:vg"); + + memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx"); + memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky"); + memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz"); + + memory->create(buf1,nbuf,"pppm:buf1"); + memory->create(buf2,nbuf,"pppm:buf2"); + + // summation coeffs + + gf_b = new double[order]; + memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d"); + memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff"); + + // create 2 FFTs and a Remap + // 1st FFT keeps data in FFT decompostion + // 2nd FFT returns data in 3d brick decomposition + // remap takes data from 3d brick to FFT decomposition + + int tmp; + + fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 0,0,&tmp); + + fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + 0,0,&tmp); + + remap = new Remap(lmp,world, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 1,0,0,2); +} + +/* ---------------------------------------------------------------------- + deallocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::deallocate() +{ + destroy_3d_offset(density_brick,nzlo_out,nylo_out); + destroy_3d_offset(vd_brick,nzlo_out,nylo_out); + + memory->destroy(density_fft); + memory->destroy(greensfn); + memory->destroy(work1); + memory->destroy(work2); + memory->destroy(vg); + + memory->destroy1d_offset(fkx,nxlo_fft); + memory->destroy1d_offset(fky,nylo_fft); + memory->destroy1d_offset(fkz,nzlo_fft); + + memory->destroy(buf1); + memory->destroy(buf2); + + delete [] gf_b; + 
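Both setup() and the Green's-function loop map an FFT-grid index to a signed wavevector component with per = i - n*(2*i/n), which folds indices above n/2 onto negative frequencies before scaling by 2*pi/L. A standalone sketch with illustrative n and L:

// Standalone sketch of the index-to-wavevector folding used for fkx,fky,fkz
// and for kper,lper,mper in the Green's-function loop.
#include <cstdio>
#include <cmath>

int main()
{
  int n = 8;              // grid points along one dimension
  double L = 10.0;        // box length along that dimension
  double PI = 4.0*atan(1.0);
  double unitk = 2.0*PI/L;

  for (int i = 0; i < n; i++) {
    int per = i - n*(2*i/n);          // 0,1,2,3,-4,-3,-2,-1 for n = 8
    printf("i = %d  per = %2d  fk = %g\n",i,per,unitk*per);
  }
  return 0;
}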
memory->destroy2d_offset(rho1d,-order/2); + memory->destroy2d_offset(rho_coeff,(1-order)/2); + + delete fft1; + delete fft2; + delete remap; +} + +/* ---------------------------------------------------------------------- + set size of FFT grid (nx,ny,nz_pppm) and g_ewald +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::set_grid() +{ + // see JCP 109, pg 7698 for derivation of coefficients + // higher order coefficients may be computed if needed + + double **acons; + memory->create(acons,8,7,"pppm:acons"); + + acons[1][0] = 2.0 / 3.0; + acons[2][0] = 1.0 / 50.0; + acons[2][1] = 5.0 / 294.0; + acons[3][0] = 1.0 / 588.0; + acons[3][1] = 7.0 / 1440.0; + acons[3][2] = 21.0 / 3872.0; + acons[4][0] = 1.0 / 4320.0; + acons[4][1] = 3.0 / 1936.0; + acons[4][2] = 7601.0 / 2271360.0; + acons[4][3] = 143.0 / 28800.0; + acons[5][0] = 1.0 / 23232.0; + acons[5][1] = 7601.0 / 13628160.0; + acons[5][2] = 143.0 / 69120.0; + acons[5][3] = 517231.0 / 106536960.0; + acons[5][4] = 106640677.0 / 11737571328.0; + acons[6][0] = 691.0 / 68140800.0; + acons[6][1] = 13.0 / 57600.0; + acons[6][2] = 47021.0 / 35512320.0; + acons[6][3] = 9694607.0 / 2095994880.0; + acons[6][4] = 733191589.0 / 59609088000.0; + acons[6][5] = 326190917.0 / 11700633600.0; + acons[7][0] = 1.0 / 345600.0; + acons[7][1] = 3617.0 / 35512320.0; + acons[7][2] = 745739.0 / 838397952.0; + acons[7][3] = 56399353.0 / 12773376000.0; + acons[7][4] = 25091609.0 / 1560084480.0; + acons[7][5] = 1755948832039.0 / 36229939200000.0; + acons[7][6] = 4887769399.0 / 37838389248.0; + + double q2 = qsqsum / force->dielectric; + bigint natoms = atom->natoms; + + // use xprd,yprd,zprd even if triclinic so grid size is the same + // adjust z dimension for 2d slab PPPM + // 3d PPPM just uses zprd since slab_volfactor = 1.0 + + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double zprd_slab = zprd*slab_volfactor; + + // make initial g_ewald estimate + // based on desired error and real space cutoff + // fluid-occupied volume used to estimate real-space error + // zprd used rather than zprd_slab + + double hx,hy,hz; + + if (!gewaldflag) + g_ewald = sqrt(-log(precision*sqrt(natoms*cutoff*xprd*yprd*zprd) / + (2.0*q2))) / cutoff; + + // set optimal nx_pppm,ny_pppm,nz_pppm based on order and precision + // nz_pppm uses extended zprd_slab instead of zprd + // h = 1/g_ewald is upper bound on h such that h*g_ewald <= 1 + // reduce it until precision target is met + + if (!gridflag) { + double err; + hx = hy = hz = 1/g_ewald; + + nx_pppm = static_cast (xprd/hx + 1); + ny_pppm = static_cast (yprd/hy + 1); + nz_pppm = static_cast (zprd_slab/hz + 1); + + err = rms(hx,xprd,natoms,q2,acons); + while (err > precision) { + err = rms(hx,xprd,natoms,q2,acons); + nx_pppm++; + hx = xprd/nx_pppm; + } + + err = rms(hy,yprd,natoms,q2,acons); + while (err > precision) { + err = rms(hy,yprd,natoms,q2,acons); + ny_pppm++; + hy = yprd/ny_pppm; + } + + err = rms(hz,zprd_slab,natoms,q2,acons); + while (err > precision) { + err = rms(hz,zprd_slab,natoms,q2,acons); + nz_pppm++; + hz = zprd_slab/nz_pppm; + } + } + + // boost grid size until it is factorable + + while (!factorable(nx_pppm)) nx_pppm++; + while (!factorable(ny_pppm)) ny_pppm++; + while (!factorable(nz_pppm)) nz_pppm++; + + // adjust g_ewald for new grid size + + hx = xprd/nx_pppm; + hy = yprd/ny_pppm; + hz = zprd_slab/nz_pppm; + + if (!gewaldflag) { + double gew1,gew2,dgew,f,fmid,hmin,rtb; + int ncount; + + gew1 = 0.0; + g_ewald = gew1; + 
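Once the RMS-error estimate has produced candidate grid counts, each dimension is bumped upward until its prime factors are all in {2,3,5}, since those are the only radices allowed by the factors list. The sketch below mirrors factorable() and that adjustment loop; the starting value is illustrative.

// Standalone sketch of "boost grid size until it is factorable".
#include <cstdio>

static int factorable(int n)
{
  static const int factors[] = {2,3,5};
  while (n > 1) {
    int i;
    for (i = 0; i < 3; i++)
      if (n % factors[i] == 0) { n /= factors[i]; break; }
    if (i == 3) return 0;      // leftover prime factor other than 2,3,5
  }
  return 1;
}

int main()
{
  int nx_pppm = 98;                            // e.g. from the RMS-error estimate
  while (!factorable(nx_pppm)) nx_pppm++;
  printf("adjusted grid size: %d\n",nx_pppm);  // 98 = 2*7*7 -> 100 = 2*2*5*5
  return 0;
}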
f = diffpr(hx,hy,hz,q2,acons); + + hmin = MIN(hx,MIN(hy,hz)); + gew2 = 10/hmin; + g_ewald = gew2; + fmid = diffpr(hx,hy,hz,q2,acons); + + if (f*fmid >= 0.0) error->all("Cannot compute PPPM G"); + rtb = f < 0.0 ? (dgew=gew2-gew1,gew1) : (dgew=gew1-gew2,gew2); + ncount = 0; + while (fabs(dgew) > SMALL && fmid != 0.0) { + dgew *= 0.5; + g_ewald = rtb + dgew; + fmid = diffpr(hx,hy,hz,q2,acons); + if (fmid <= 0.0) rtb = g_ewald; + ncount++; + if (ncount > LARGE) error->all("Cannot compute PPPM G"); + } + } + + // final RMS precision + + double lprx = rms(hx,xprd,natoms,q2,acons); + double lpry = rms(hy,yprd,natoms,q2,acons); + double lprz = rms(hz,zprd_slab,natoms,q2,acons); + double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); + double spr = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) / + sqrt(natoms*cutoff*xprd*yprd*zprd_slab); + + // free local memory + + memory->destroy(acons); + + // print info + + if (me == 0) { + if (screen) { + fprintf(screen," G vector = %g\n",g_ewald); + fprintf(screen," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + fprintf(screen," stencil order = %d\n",order); + fprintf(screen," RMS precision = %g\n",MAX(lpr,spr)); + } + if (logfile) { + fprintf(logfile," G vector = %g\n",g_ewald); + fprintf(logfile," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + fprintf(logfile," stencil order = %d\n",order); + fprintf(logfile," RMS precision = %g\n",MAX(lpr,spr)); + } + } +} + +/* ---------------------------------------------------------------------- + check if all factors of n are in list of factors + return 1 if yes, 0 if no +------------------------------------------------------------------------- */ + +template +int PPPMGPUT::factorable(int n) +{ + int i; + + while (n > 1) { + for (i = 0; i < nfactors; i++) { + if (n % factors[i] == 0) { + n /= factors[i]; + break; + } + } + if (i == nfactors) return 0; + } + + return 1; +} + +/* ---------------------------------------------------------------------- + compute RMS precision for a dimension +------------------------------------------------------------------------- */ + +template +double PPPMGPUT::rms(double h, double prd, bigint natoms, + double q2, double **acons) +{ + double sum = 0.0; + for (int m = 0; m < order; m++) + sum += acons[order][m] * pow(h*g_ewald,2.0*m); + double value = q2 * pow(h*g_ewald,order) * + sqrt(g_ewald*prd*sqrt(2.0*PI)*sum/natoms) / (prd*prd); + return value; +} + +/* ---------------------------------------------------------------------- + compute difference in real-space and kspace RMS precision +------------------------------------------------------------------------- */ + +template +double PPPMGPUT::diffpr(double hx, double hy, double hz, double q2, + double **acons) +{ + double lprx,lpry,lprz,kspace_prec,real_prec; + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + bigint natoms = atom->natoms; + + lprx = rms(hx,xprd,natoms,q2,acons); + lpry = rms(hy,yprd,natoms,q2,acons); + lprz = rms(hz,zprd*slab_volfactor,natoms,q2,acons); + kspace_prec = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); + real_prec = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) / + sqrt(natoms*cutoff*xprd*yprd*zprd); + double value = kspace_prec - real_prec; + return value; +} + +/* ---------------------------------------------------------------------- + denominator for Hockney-Eastwood Green's function + of x,y,z = sin(kx*deltax/2), etc + + inf n-1 + S(n,k) = Sum W(k+pi*j)**2 = Sum b(l)*(z*z)**l + j=-inf l=0 + + = -(z*z)**n /(2n-1)! 
* (d/dx)**(2n-1) cot(x) at z = sin(x) + gf_b = denominator expansion coeffs +------------------------------------------------------------------------- */ + +template +double PPPMGPUT::gf_denom(double x, double y, double z) +{ + double sx,sy,sz; + sz = sy = sx = 0.0; + for (int l = order-1; l >= 0; l--) { + sx = gf_b[l] + sx*x; + sy = gf_b[l] + sy*y; + sz = gf_b[l] + sz*z; + } + double s = sx*sy*sz; + return s*s; +} + +/* ---------------------------------------------------------------------- + pre-compute Green's function denominator expansion coeffs, Gamma(2n) +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::compute_gf_denom() +{ + int k,l,m; + + for (l = 1; l < order; l++) gf_b[l] = 0.0; + gf_b[0] = 1.0; + + for (m = 1; m < order; m++) { + for (l = m; l > 0; l--) + gf_b[l] = 4.0 * (gf_b[l]*(l-m)*(l-m-0.5)-gf_b[l-1]*(l-m-1)*(l-m-1)); + gf_b[0] = 4.0 * (gf_b[0]*(l-m)*(l-m-0.5)); + } + + int ifact = 1; + for (k = 1; k < 2*order; k++) ifact *= k; + double gaminv = 1.0/ifact; + for (l = 0; l < order; l++) gf_b[l] *= gaminv; +} + +/* ---------------------------------------------------------------------- + ghost-swap to accumulate full density in brick decomposition + remap density from 3d brick decomposition to FFT decomposition +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::brick2fft() +{ + int i,n,ix,iy,iz; + MPI_Request request; + MPI_Status status; + + // pack my ghosts for +x processor + // pass data to self or +x processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = nxhi_in+1; ix <= nxhi_out; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[0][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][1],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // pack my ghosts for -x processor + // pass data to self or -x processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = nxlo_out; ix < nxlo_in; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[0][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // pack my ghosts for +y processor + // pass data to self or +y processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nyhi_in+1; iy <= nyhi_out; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[1][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][1],0,world); + 
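gf_denom() evaluates the denominator of the Hockney-Eastwood Green's function as the squared product of three order-term polynomials in sin^2 of the half grid wavenumbers, using Horner's rule. A standalone sketch of that evaluation; the gf_b values are placeholders rather than the coefficients produced by compute_gf_denom().

// Standalone sketch of the Horner evaluation in gf_denom(): each factor is
// sum_l gf_b[l]*(sin^2)^l, and the denominator is the square of the product.
#include <cstdio>

int main()
{
  const int order = 5;
  double gf_b[order] = {1.0, 0.5, 0.25, 0.125, 0.0625};   // placeholder coeffs

  double x = 0.3, y = 0.2, z = 0.1;       // sin^2 of half grid wavenumbers
  double sx = 0.0, sy = 0.0, sz = 0.0;
  for (int l = order-1; l >= 0; l--) {    // Horner's rule, highest power first
    sx = gf_b[l] + sx*x;
    sy = gf_b[l] + sy*y;
    sz = gf_b[l] + sz*z;
  }
  double denom = (sx*sy*sz)*(sx*sy*sz);
  printf("gf_denom(%g,%g,%g) = %g\n",x,y,z,denom);
  return 0;
}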
MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // pack my ghosts for -y processor + // pass data to self or -y processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy < nylo_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[1][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // pack my ghosts for +z processor + // pass data to self or +z processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzhi_in+1; iz <= nzhi_out; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[2][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][1],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // pack my ghosts for -z processor + // pass data to self or -z processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzlo_out; iz < nzlo_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[2][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // remap from 3d brick decomposition to FFT decomposition + // copy grabs inner portion of density from 3d brick + // remap could be done as pre-stage of FFT, + // but this works optimally on only double values, not complex values + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_fft[n++] = density_brick[iz][iy][ix]; + + remap->perform(density_fft,density_fft,work1); +} + +/* ---------------------------------------------------------------------- + ghost-swap to fill ghost cells of my brick with field values +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::fillbrick() +{ + int i,n,ix,iy,iz; + MPI_Request request; + MPI_Status status; + + // pack my real cells for +z processor + // pass data to self or +z processor + // unpack and sum recv data into my ghost cells + + n = 0; + int x_lo = nxlo_in * 4; + int x_hi = nxhi_in * 4 + 1; + for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++) + 
for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[2][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][1],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz < nzlo_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } + + // pack my real cells for -z processor + // pass data to self or -z processor + // unpack and sum recv data into my ghost cells + + n = 0; + for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[2][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzhi_in+1; iz <= nzhi_out; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } + + // pack my real cells for +y processor + // pass data to self or +y processor + // unpack and sum recv data into my ghost cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[1][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][1],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy < nylo_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } + + // pack my real cells for -y processor + // pass data to self or -y processor + // unpack and sum recv data into my ghost cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[1][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nyhi_in+1; iy <= nyhi_out; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } + + // pack my real cells for +x processor + // pass data to self or +x processor + // unpack and sum recv data into my ghost cells + 
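fillbrick() walks vd_brick with a stride of 4 because each grid point stores the three field components plus one padding slot, so x indices are premultiplied by 4 and only the first three slots per point are communicated. A standalone sketch of that interleaved layout and pack loop, with illustrative sizes:

// Standalone sketch of the interleaved (vdx, vdy, vdz, pad) layout and the
// stride-4 pack loop used by fillbrick().
#include <cstdio>

int main()
{
  const int nx = 3;                 // grid points along x for one (z,y) row
  double row[4*nx];                 // interleaved (vdx, vdy, vdz, pad) per point

  for (int i = 0; i < nx; i++) {    // fill with recognizable values
    row[4*i+0] = 10.0*i + 1;        // vdx
    row[4*i+1] = 10.0*i + 2;        // vdy
    row[4*i+2] = 10.0*i + 3;        // vdz
    row[4*i+3] = 0.0;               // padding, never communicated
  }

  double buf[3*nx];                 // communication buffer holds 3 values/point
  int n = 0;
  int x_lo = 0*4, x_hi = (nx-1)*4 + 1;
  for (int ix = x_lo; ix < x_hi; ix += 4) {
    buf[n++] = row[ix];
    buf[n++] = row[ix+1];
    buf[n++] = row[ix+2];
  }
  printf("packed %d values from %d grid points\n",n,nx);
  return 0;
}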
+ n = 0; + x_lo = (nxhi_in-nxhi_ghost+1) * 4; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[0][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][1],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + x_lo = nxlo_out * 4; + x_hi = nxlo_in * 4; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } + + // pack my real cells for -x processor + // pass data to self or -x processor + // unpack and sum recv data into my ghost cells + + n = 0; + x_lo = x_hi; + x_hi = (nxlo_in+nxlo_ghost) * 4; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[0][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + x_lo = (nxhi_in + 1) * 4; + x_hi = nxhi_out * 4 + 1; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::poisson(int eflag, int vflag) +{ + int i,j,k,n; + double eng; + + // transform charge density (r -> k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] = density_fft[i]; + work1[n++] = 0.0; + } + + fft1->compute(work1,work1,1); + + // if requested, compute energy and virial contribution + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + if (eflag || vflag) { + if (vflag) { + n = 0; + for (i = 0; i < nfft; i++) { + eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; + energy += eng; + n += 2; + } + } else { + n = 0; + for (i = 0; i < nfft; i++) { + energy += + s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + n += 2; + } + } + } + + // scale by 1/total-grid-pts to get rho(k) + // multiply by Green's function to get V(k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] *= scaleinv * greensfn[i]; + work1[n++] *= scaleinv * greensfn[i]; + } + + // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // FFT leaves data in 3d brick decomposition + // copy it into inner portion of vdx,vdy,vdz arrays + + // x direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkx[i]*work1[n+1]; + work2[n+1] = -fkx[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + int x_hi = nxhi_in * 4 + 3; + for (k 
= nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in * 4; i < x_hi; i+=4) { + vd_brick[k][j][i] = work2[n]; + n += 2; + } + + // y direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fky[j]*work1[n+1]; + work2[n+1] = -fky[j]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in * 4 + 1; i < x_hi; i+=4) { + vd_brick[k][j][i] = work2[n]; + n += 2; + } + + // z direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkz[k]*work1[n+1]; + work2[n+1] = -fkz[k]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in * 4 + 2; i < x_hi; i+=4) { + vd_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + map nprocs to NX by NY grid as PX by PY procs - return optimal px,py +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::procs2grid2d(int nprocs, int nx, int ny, int *px, int *py) +{ + // loop thru all possible factorizations of nprocs + // surf = surface area of largest proc sub-domain + // innermost if test minimizes surface area and surface/volume ratio + + int bestsurf = 2 * (nx + ny); + int bestboxx = 0; + int bestboxy = 0; + + int boxx,boxy,surf,ipx,ipy; + + ipx = 1; + while (ipx <= nprocs) { + if (nprocs % ipx == 0) { + ipy = nprocs/ipx; + boxx = nx/ipx; + if (nx % ipx) boxx++; + boxy = ny/ipy; + if (ny % ipy) boxy++; + surf = boxx + boxy; + if (surf < bestsurf || + (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) { + bestsurf = surf; + bestboxx = boxx; + bestboxy = boxy; + *px = ipx; + *py = ipy; + } + } + ipx++; + } +} + +/* ---------------------------------------------------------------------- + charge assignment into rho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::compute_rho1d(double dx, double dy, double dz) +{ + int k,l; + + for (k = (1-order)/2; k <= order/2; k++) { + rho1d[0][k] = 0.0; + rho1d[1][k] = 0.0; + rho1d[2][k] = 0.0; + for (l = order-1; l >= 0; l--) { + rho1d[0][k] = rho_coeff[l][k] + rho1d[0][k]*dx; + rho1d[1][k] = rho_coeff[l][k] + rho1d[1][k]*dy; + rho1d[2][k] = rho_coeff[l][k] + rho1d[2][k]*dz; + } + } +} + +/* ---------------------------------------------------------------------- + generate coeffients for the weight function of order n + + (n-1) + Wn(x) = Sum wn(k,x) , Sum is over every other integer + k=-(n-1) + For k=-(n-1),-(n-1)+2, ....., (n-1)-2,n-1 + k is odd integers if n is even and even integers if n is odd + --- + | n-1 + | Sum a(l,j)*(x-k/2)**l if abs(x-k/2) < 1/2 + wn(k,x) = < l=0 + | + | 0 otherwise + --- + a coeffients are packed into the array rho_coeff to eliminate zeros + rho_coeff(l,((k+mod(n+1,2))/2) = a(l,k) +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::compute_rho_coeff() +{ + int j,k,l,m; + double s; + + double **a; + memory->create2d_offset(a,order,-order,order,"pppm:a"); + + for (k = -order; k <= order; k++) + for (l = 0; l < order; l++) + a[l][k] = 0.0; + + 
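+  // a[l][k] is the coefficient of (x-k/2)**l in the piecewise polynomial
+  // defined in the header comment above; starting from a[0][0] = 1, each
+  // pass of the j loop builds the order j+1 coefficients from order j,
+  // and the nonzero entries are then packed into rho_coeff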
a[0][0] = 1.0; + for (j = 1; j < order; j++) { + for (k = -j; k <= j; k += 2) { + s = 0.0; + for (l = 0; l < j; l++) { + a[l+1][k] = (a[l][k+1]-a[l][k-1]) / (l+1); + s += pow(0.5,(double) l+1) * + (a[l][k-1] + pow(-1.0,(double) l) * a[l][k+1]) / (l+1); + } + a[0][k] = s; + } + } + + m = (1-order)/2; + for (k = -(order-1); k < order; k += 2) { + for (l = 0; l < order; l++) + rho_coeff[l][m] = a[l][k]; + m++; + } + + memory->destroy2d_offset(a,-order); +} + +/* ---------------------------------------------------------------------- + Slab-geometry correction term to dampen inter-slab interactions between + periodically repeating slabs. Yields good approximation to 2D Ewald if + adequate empty space is left between repeating slabs (J. Chem. Phys. + 111, 3155). Slabs defined here to be parallel to the xy plane. +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::slabcorr(int eflag) +{ + // compute local contribution to global dipole moment + + double *q = atom->q; + double **x = atom->x; + int nlocal = atom->nlocal; + + double dipole = 0.0; + for (int i = 0; i < nlocal; i++) dipole += q[i]*x[i][2]; + + // sum local contributions to get global dipole moment + + double dipole_all; + MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world); + + // compute corrections + + double e_slabcorr = 2.0*PI*dipole_all*dipole_all/volume; + + if (eflag) energy += qqrd2e*scale * e_slabcorr; + + // add on force corrections + + double ffact = -4.0*PI*dipole_all/volume; + double **f = atom->f; + + for (int i = 0; i < nlocal; i++) f[i][2] += qqrd2e*scale * q[i]*ffact; +} + +/* ---------------------------------------------------------------------- + perform and time the 4 FFTs required for N timesteps +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::timing(int n, double &time3d, double &time1d) +{ + double time1,time2; + + for (int i = 0; i < 2*nfft_both; i++) work1[i] = 0.0; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->compute(work1,work1,1); + fft2->compute(work1,work1,-1); + fft2->compute(work1,work1,-1); + fft2->compute(work1,work1,-1); + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time3d = time2 - time1; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->timing1d(work1,nfft_both,1); + fft2->timing1d(work1,nfft_both,-1); + fft2->timing1d(work1,nfft_both,-1); + fft2->timing1d(work1,nfft_both,-1); + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time1d = time2 - time1; +} + +/* ---------------------------------------------------------------------- + Create array using offsets from pinned memory allocation +------------------------------------------------------------------------- */ + +template +grdtyp ***PPPMGPUT::create_3d_offset(int n1lo, int n1hi, int n2lo, int n2hi, + int n3lo, int n3hi, const char *name, + grdtyp *data, int vec_length) +{ + int i,j; + int n1 = n1hi - n1lo + 1; + int n2 = n2hi - n2lo + 1; + int n3 = n3hi - n3lo + 1; + + grdtyp **plane = (grdtyp **)memory->smalloc(n1*n2*sizeof(grdtyp *),name); + grdtyp ***array = (grdtyp ***)memory->smalloc(n1*sizeof(grdtyp **),name); + + int n = 0; + for (i = 0; i < n1; i++) { + array[i] = &plane[i*n2]; + for (j = 0; j < n2; j++) { + plane[i*n2+j] = &data[n]; + n += n3*vec_length; + } + } + + for (i = 0; i < n1*n2; i++) array[0][i] -= n3lo*vec_length; + for (i = 0; i < n1; i++) array[i] -= n2lo; + return array-n1lo; +} + +/* 
---------------------------------------------------------------------- + templated 3d memory offsets +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::destroy_3d_offset(grdtyp ***array, int n1_offset, + int n2_offset) +{ + if (array == NULL) return; + memory->sfree(&array[n1_offset][n2_offset]); + memory->sfree(array + n1_offset); +} + +template class PPPMGPU; +template class PPPMGPU; diff --git a/src/GPU/pppm_gpu.h b/src/GPU/pppm_gpu.h new file mode 100644 index 0000000000..ba0b3e3e93 --- /dev/null +++ b/src/GPU/pppm_gpu.h @@ -0,0 +1,103 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_PPPM_GPU_H +#define LMP_PPPM_GPU_H + +#include "kspace.h" +#include "lmptype.h" + +namespace LAMMPS_NS { + +template +class PPPMGPU : public KSpace { + public: + PPPMGPU(class LAMMPS *, int, char **); + virtual ~PPPMGPU(); + virtual void init() = 0; + void base_init(); + void setup(); + virtual void compute(int, int) = 0; + void timing(int, double &, double &); + virtual double memory_usage() = 0; + + protected: + int me,nprocs; + double PI; + double precision; + int nfactors; + int *factors; + double qsum,qsqsum; + double qqrd2e; + double cutoff; + double volume; + double delxinv,delyinv,delzinv,delvolinv; + double shift,shiftone; + + int nxlo_in,nylo_in,nzlo_in,nxhi_in,nyhi_in,nzhi_in; + int nxlo_out,nylo_out,nzlo_out,nxhi_out,nyhi_out,nzhi_out; + int nxlo_ghost,nxhi_ghost,nylo_ghost,nyhi_ghost,nzlo_ghost,nzhi_ghost; + int nxlo_fft,nylo_fft,nzlo_fft,nxhi_fft,nyhi_fft,nzhi_fft; + int nlower,nupper; + int ngrid,nfft,nbuf,nfft_both; + + grdtyp ***density_brick; + grdtyp ***vd_brick; + double *greensfn; + double **vg; + double *fkx,*fky,*fkz; + double *density_fft; + double *work1,*work2; + double *buf1,*buf2; + + double *gf_b; + double **rho1d,**rho_coeff; + + class FFT3d *fft1,*fft2; + class Remap *remap; + + int nmax; + + int triclinic; // domain settings, orthog or triclinic + double *boxlo; + // TIP4P settings + int typeH,typeO; // atom types of TIP4P water H and O atoms + double qdist; // distance from O site to negative charge + double alpha; // geometric factor + + void set_grid(); + void allocate(); + void deallocate(); + int factorable(int); + double rms(double, double, bigint, double, double **); + double diffpr(double, double, double, double, double **); + void compute_gf_denom(); + double gf_denom(double, double, double); + void brick2fft(); + void fillbrick(); + void poisson(int, int); + void procs2grid2d(int,int,int,int *, int*); + void compute_rho1d(double, double, double); + void compute_rho_coeff(); + void slabcorr(int); + + double poisson_time; + + grdtyp ***create_3d_offset(int, int, int, int, int, int, const char *, + grdtyp *, int); + void destroy_3d_offset(grdtyp ***, int, int); +}; + +} + +#endif diff --git a/src/GPU/pppm_gpu_double.cpp b/src/GPU/pppm_gpu_double.cpp new file mode 100644 index 0000000000..4ddc7e5ad5 --- /dev/null +++ 
b/src/GPU/pppm_gpu_double.cpp @@ -0,0 +1,217 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "string.h" +#include "stdio.h" +#include "stdlib.h" +#include "math.h" +#include "pppm_gpu_double.h" +#include "lmptype.h" +#include "atom.h" +#include "comm.h" +#include "neighbor.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "domain.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "memory.h" +#include "error.h" +#include "gpu_extra.h" + +#define grdtyp double + +// External functions from cuda library for atom decomposition + +grdtyp* pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, + const int order, const int nxlo_out, + const int nylo_out, const int nzlo_out, + const int nxhi_out, const int nyhi_out, + const int nzhi_out, double **rho_coeff, + grdtyp **_vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, + const int nz_pppm, int &success); +void pppm_gpu_clear_d(const double poisson_time); +int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, const double delxinv, + const double delyinv, const double delzinv); +void pppm_gpu_interp_d(const grdtyp qqrd2e_scale); +double pppm_gpu_bytes_d(); + +using namespace LAMMPS_NS; + +#define MAXORDER 7 +#define OFFSET 16384 +#define SMALL 0.00001 +#define LARGE 10000.0 +#define EPS_HOC 1.0e-7 + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PPPMGPUDouble::PPPMGPUDouble(LAMMPS *lmp, int narg, char **arg) : + PPPMGPU(lmp, narg, arg) +{ +} + +/* ---------------------------------------------------------------------- + free all memory +------------------------------------------------------------------------- */ + +PPPMGPUDouble::~PPPMGPUDouble() +{ + pppm_gpu_clear_d(poisson_time); +} + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + +void PPPMGPUDouble::init() +{ + base_init(); + + if (order>8) + error->all("Cannot use order greater than 8 with pppm/gpu."); + pppm_gpu_clear_d(poisson_time); + + int success; + grdtyp *data, *h_brick; + h_brick = pppm_gpu_init_d(atom->nlocal, atom->nlocal+atom->nghost, screen, + order, nxlo_out, nylo_out, nzlo_out, nxhi_out, + nyhi_out, nzhi_out, rho_coeff, &data, + slab_volfactor, nx_pppm, ny_pppm, nz_pppm, + success); + + GPU_EXTRA::check_flag(success,error,world); + + density_brick = + create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_brick",h_brick,1); + vd_brick = + create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vd_brick",data,4); + + poisson_time=0; +} + +/* ---------------------------------------------------------------------- + compute the PPPMGPU long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPMGPUDouble::compute(int eflag, int vflag) +{ + bool success = true; + int flag=pppm_gpu_spread_d(neighbor->ago, atom->nlocal, atom->nlocal + + atom->nghost, atom->x, atom->type, success, + atom->q, domain->boxlo, delxinv, delyinv, + delzinv); + if (!success) + error->one("Out of memory on GPGPU"); + if (flag != 0) + error->one("Out of range atoms - cannot compute PPPM"); + + int i; + + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + energy = 0.0; + if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0; + + double t3=MPI_Wtime(); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + brick2fft(); + + // compute potential gradient on my FFT grid and + // portion of e_long on this proc's FFT grid + // return gradients (electric fields) in 3d brick decomposition + + poisson(eflag,vflag); + + // all procs communicate E-field values to fill ghost cells + // surrounding their 3d bricks + + fillbrick(); + + poisson_time+=MPI_Wtime()-t3; + + // calculate the force on my particles + + grdtyp qqrd2e_scale=qqrd2e*scale; + pppm_gpu_interp_d(qqrd2e_scale); + + // sum energy across procs and add in volume-dependent term + + if (eflag) { + double energy_all; + MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy = energy_all; + + energy *= 0.5*volume; + energy -= g_ewald*qsqsum/1.772453851 + + 0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume); + energy *= qqrd2e*scale; + } + + // sum virial across procs + + if (vflag) { + double virial_all[6]; + MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*scale*volume*virial_all[i]; + } + + // 2d slab correction + + if (slabflag) slabcorr(eflag); + + // convert atoms back from lamda to box coords + + if 
(triclinic) domain->lamda2x(atom->nlocal); +} + +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ + +double PPPMGPUDouble::memory_usage() +{ + double bytes = nmax*3 * sizeof(double); + int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + bytes += 4 * nbrick * sizeof(grdtyp); + bytes += 6 * nfft_both * sizeof(double); + bytes += nfft_both*6 * sizeof(double); + bytes += 2 * nbuf * sizeof(double); + return bytes + pppm_gpu_bytes_d(); +} diff --git a/src/GPU/pppm_gpu_double.h b/src/GPU/pppm_gpu_double.h new file mode 100644 index 0000000000..a64612a7c0 --- /dev/null +++ b/src/GPU/pppm_gpu_double.h @@ -0,0 +1,42 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(pppm/gpu/double,PPPMGPUDouble) + +#else + +#ifndef LMP_PPPM_GPU_DOUBLE_H +#define LMP_PPPM_GPU_DOUBLE_H + +#include "pppm_gpu.h" +#include "lmptype.h" + +namespace LAMMPS_NS { + +class PPPMGPUDouble : public PPPMGPU { + public: + PPPMGPUDouble(class LAMMPS *, int, char **); + ~PPPMGPUDouble(); + void init(); + void compute(int, int); + double memory_usage(); + + protected: +}; + +} + +#endif +#endif diff --git a/src/GPU/pppm_gpu_single.cpp b/src/GPU/pppm_gpu_single.cpp new file mode 100644 index 0000000000..cee6b438bf --- /dev/null +++ b/src/GPU/pppm_gpu_single.cpp @@ -0,0 +1,216 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "string.h" +#include "stdio.h" +#include "stdlib.h" +#include "math.h" +#include "pppm_gpu_single.h" +#include "lmptype.h" +#include "atom.h" +#include "comm.h" +#include "neighbor.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "domain.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "memory.h" +#include "error.h" +#include "gpu_extra.h" + +#define grdtyp float + +// External functions from cuda library for atom decomposition + +grdtyp* pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, + const int order, const int nxlo_out, + const int nylo_out, const int nzlo_out, + const int nxhi_out, const int nyhi_out, + const int nzhi_out, double **rho_coeff, + grdtyp **_vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, + const int nz_pppm, int &success); +void pppm_gpu_clear_f(const double poisson_time); +int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, const double delxinv, + const double delyinv, const double delzinv); +void pppm_gpu_interp_f(const grdtyp qqrd2e_scale); +double pppm_gpu_bytes_f(); + +using namespace LAMMPS_NS; + +#define MAXORDER 7 +#define OFFSET 16384 +#define SMALL 0.00001 +#define LARGE 10000.0 +#define EPS_HOC 1.0e-7 + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PPPMGPUSingle::PPPMGPUSingle(LAMMPS *lmp, int narg, char **arg) : + PPPMGPU(lmp, narg, arg) +{ +} + +/* ---------------------------------------------------------------------- + free all memory +------------------------------------------------------------------------- */ + +PPPMGPUSingle::~PPPMGPUSingle() +{ + pppm_gpu_clear_f(poisson_time); +} + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + +void PPPMGPUSingle::init() +{ + base_init(); + + if (order>8) + error->all("Cannot use order greater than 8 with pppm/gpu."); + pppm_gpu_clear_f(poisson_time); + + int success; + grdtyp *data, *h_brick; + h_brick = pppm_gpu_init_f(atom->nlocal, atom->nlocal+atom->nghost, screen, + order, nxlo_out, nylo_out, nzlo_out, nxhi_out, + nyhi_out, nzhi_out, rho_coeff, &data, + slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success); + + GPU_EXTRA::check_flag(success,error,world); + + density_brick = + create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_brick",h_brick,1); + vd_brick = + create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vd_brick",data,4); + + poisson_time=0; +} + +/* ---------------------------------------------------------------------- + compute the PPPMGPU long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPMGPUSingle::compute(int eflag, int vflag) +{ + bool success = true; + int flag=pppm_gpu_spread_f(neighbor->ago, atom->nlocal, atom->nlocal + + atom->nghost, atom->x, atom->type, success, + atom->q, domain->boxlo, delxinv, delyinv, + delzinv); 
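+  // charge assignment to the density grid is done on the device by
+  // pppm_gpu_spread_f(); success is set false if device memory could not
+  // be allocated, and flag is nonzero if any atoms were out of range for
+  // charge assignment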
+ if (!success) + error->one("Out of memory on GPGPU"); + if (flag != 0) + error->one("Out of range atoms - cannot compute PPPM"); + + int i; + + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + energy = 0.0; + if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0; + + double t3=MPI_Wtime(); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + brick2fft(); + + // compute potential gradient on my FFT grid and + // portion of e_long on this proc's FFT grid + // return gradients (electric fields) in 3d brick decomposition + + poisson(eflag,vflag); + + // all procs communicate E-field values to fill ghost cells + // surrounding their 3d bricks + + fillbrick(); + + poisson_time+=MPI_Wtime()-t3; + + // calculate the force on my particles + + grdtyp qqrd2e_scale=qqrd2e*scale; + pppm_gpu_interp_f(qqrd2e_scale); + + // sum energy across procs and add in volume-dependent term + + if (eflag) { + double energy_all; + MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy = energy_all; + + energy *= 0.5*volume; + energy -= g_ewald*qsqsum/1.772453851 + + 0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume); + energy *= qqrd2e*scale; + } + + // sum virial across procs + + if (vflag) { + double virial_all[6]; + MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*scale*volume*virial_all[i]; + } + + // 2d slab correction + + if (slabflag) slabcorr(eflag); + + // convert atoms back from lamda to box coords + + if (triclinic) domain->lamda2x(atom->nlocal); +} + +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ + +double PPPMGPUSingle::memory_usage() +{ + double bytes = nmax*3 * sizeof(double); + int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + bytes += 4 * nbrick * sizeof(grdtyp); + bytes += 6 * nfft_both * sizeof(double); + bytes += nfft_both*6 * sizeof(double); + bytes += 2 * nbuf * sizeof(double); + return bytes + pppm_gpu_bytes_f(); +} diff --git a/src/GPU/pppm_gpu_single.h b/src/GPU/pppm_gpu_single.h new file mode 100644 index 0000000000..c17454dca4 --- /dev/null +++ b/src/GPU/pppm_gpu_single.h @@ -0,0 +1,42 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/gpu/single,PPPMGPUSingle)
+
+#else
+
+#ifndef LMP_PPPM_GPU_SINGLE_H
+#define LMP_PPPM_GPU_SINGLE_H
+
+#include "pppm_gpu.h"
+#include "lmptype.h"
+
+namespace LAMMPS_NS {
+
+class PPPMGPUSingle : public PPPMGPU<float> {
+ public:
+  PPPMGPUSingle(class LAMMPS *, int, char **);
+  ~PPPMGPUSingle();
+  void init();
+  void compute(int, int);
+  double memory_usage();
+
+ protected:
+};
+
+}
+
+#endif
+#endif