From e2707ca96f37c10fc982ea99fdde9db68b7c0784 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Thu, 28 Apr 2011 16:16:54 +0000 Subject: [PATCH 01/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6026 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/fix_rigid.cpp | 35 +++++++++++------------------------ src/math_extra.cpp | 6 ++---- src/math_extra.h | 12 ++++++++++++ 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/src/fix_rigid.cpp b/src/fix_rigid.cpp index 1099f09d7a..4d0686409d 100644 --- a/src/fix_rigid.cpp +++ b/src/fix_rigid.cpp @@ -648,8 +648,7 @@ void FixRigid::init() sum[ibody][0] += 0.4 * massone * radius[i]*radius[i]; sum[ibody][1] += 0.4 * massone * radius[i]*radius[i]; sum[ibody][2] += 0.4 * massone * radius[i]*radius[i]; - } - if (eflags[i] & INERTIA_ELLIPSOID) { + } else if (eflags[i] & INERTIA_ELLIPSOID) { shape = ebonus[ellipsoid[i]].shape; quatatom = ebonus[ellipsoid[i]].quat; MathExtra::inertia_ellipsoid(shape,quatatom,massone,ivec); @@ -665,11 +664,12 @@ void FixRigid::init() MPI_Allreduce(sum[0],all[0],6*nbody,MPI_DOUBLE,MPI_SUM,world); + // diagonalize inertia tensor for each body via Jacobi rotations // inertia = 3 eigenvalues = principal moments of inertia - // ex_space,ey_space,ez_space = 3 eigenvectors = principal axes of rigid body - + // evectors and exzy_space = 3 evectors = principal axes of rigid body + int ierror; - double ez0,ez1,ez2; + double cross[3]; double tensor[3][3],evectors[3][3]; for (ibody = 0; ibody < nbody; ibody++) { @@ -686,11 +686,9 @@ void FixRigid::init() ex_space[ibody][0] = evectors[0][0]; ex_space[ibody][1] = evectors[1][0]; ex_space[ibody][2] = evectors[2][0]; - ey_space[ibody][0] = evectors[0][1]; ey_space[ibody][1] = evectors[1][1]; ey_space[ibody][2] = evectors[2][1]; - ez_space[ibody][0] = evectors[0][2]; ez_space[ibody][1] = evectors[1][2]; ez_space[ibody][2] = evectors[2][2]; @@ -706,21 +704,11 @@ void FixRigid::init() if (inertia[ibody][2] < EPSILON*max) inertia[ibody][2] = 0.0; // enforce 
3 evectors as a right-handed coordinate system - // flip 3rd evector if needed - - ez0 = ex_space[ibody][1]*ey_space[ibody][2] - - ex_space[ibody][2]*ey_space[ibody][1]; - ez1 = ex_space[ibody][2]*ey_space[ibody][0] - - ex_space[ibody][0]*ey_space[ibody][2]; - ez2 = ex_space[ibody][0]*ey_space[ibody][1] - - ex_space[ibody][1]*ey_space[ibody][0]; - - if (ez0*ez_space[ibody][0] + ez1*ez_space[ibody][1] + - ez2*ez_space[ibody][2] < 0.0) { - ez_space[ibody][0] = -ez_space[ibody][0]; - ez_space[ibody][1] = -ez_space[ibody][1]; - ez_space[ibody][2] = -ez_space[ibody][2]; - } + // flip 3rd vector if needed + + MathExtra::cross3(ex_space[ibody],ey_space[ibody],cross); + if (MathExtra::dot3(cross,ez_space[ibody]) < 0.0) + MathExtra::negate3(ez_space[ibody]); // create initial quaternion @@ -823,8 +811,7 @@ void FixRigid::init() sum[ibody][0] += 0.4 * massone * radius[i]*radius[i]; sum[ibody][1] += 0.4 * massone * radius[i]*radius[i]; sum[ibody][2] += 0.4 * massone * radius[i]*radius[i]; - } - if (eflags[i] & INERTIA_ELLIPSOID) { + } else if (eflags[i] & INERTIA_ELLIPSOID) { shape = ebonus[ellipsoid[i]].shape; MathExtra::inertia_ellipsoid(shape,qorient[i],massone,ivec); sum[ibody][0] += ivec[0]; diff --git a/src/math_extra.cpp b/src/math_extra.cpp index c4318e8bbe..5160262aff 100644 --- a/src/math_extra.cpp +++ b/src/math_extra.cpp @@ -487,7 +487,7 @@ void inertia_line(double length, double theta, double mass, double *inertia) /* ---------------------------------------------------------------------- compute space-frame inertia tensor of a triangle v0,v1,v2 = 3 vertices of triangle - from http://en.wikipedia.org/wiki/Inertia_tensor_of_triangle: + from http://en.wikipedia.org/wiki/Inertia_tensor_of_triangle inertia tensor = a/24 (v0^2 + v1^2 + v2^2 + (v0+v1+v2)^2) I - a Vt S V a = 2*area of tri = |(v1-v0) x (v2-v0)| I = 3x3 identity matrix @@ -523,9 +523,7 @@ void inertia_triangle(double *v0, double *v1, double *v2, sub3(v2,v0,v2mv0); cross3(v1mv0,v2mv0,normal); double a = 
len3(normal); - double inv24 = 1.0/24.0; - - // NOTE: use mass + double inv24 = mass/24.0; inertia[0] = inv24*a*(sum-vtsv[0][0]); inertia[1] = inv24*a*(sum-vtsv[1][1]); diff --git a/src/math_extra.h b/src/math_extra.h index 3ca98f8f12..44af2e9a8a 100755 --- a/src/math_extra.h +++ b/src/math_extra.h @@ -30,6 +30,7 @@ namespace MathExtra { inline void norm3(double *v); inline void normalize3(const double *v, double *ans); inline void snormalize3(const double, const double *v, double *ans); + inline void negate3(double *v); inline void add3(const double *v1, const double *v2, double *ans); inline void sub3(const double *v1, const double *v2, double *ans); inline double len3(const double *v); @@ -156,6 +157,17 @@ void MathExtra::snormalize3(const double length, const double *v, double *ans) ans[2] = v[2]*scale; } +/* ---------------------------------------------------------------------- + negate vector v +------------------------------------------------------------------------- */ + +void MathExtra::negate3(double *v) +{ + v[0] = -v[0]; + v[1] = -v[1]; + v[2] = -v[2]; +} + /* ---------------------------------------------------------------------- ans = v1 + v2 ------------------------------------------------------------------------- */ From bce11d1402ab028a34a9b63e10883c56e7be8da1 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Thu, 28 Apr 2011 18:52:25 +0000 Subject: [PATCH 02/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6029 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/compute_cluster_atom.cpp | 2 +- src/fix_langevin.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/compute_cluster_atom.cpp b/src/compute_cluster_atom.cpp index 38a63d078c..0f3f3dd709 100644 --- a/src/compute_cluster_atom.cpp +++ b/src/compute_cluster_atom.cpp @@ -104,7 +104,7 @@ void ComputeClusterAtom::compute_peratom() // grow clusterID array if necessary - if (atom->nlocal > nmax) { + if (atom->nlocal+atom->nghost > nmax) { memory->destroy(clusterID); nmax = 
atom->nmax; memory->create(clusterID,nmax,"cluster/atom:clusterID"); diff --git a/src/fix_langevin.cpp b/src/fix_langevin.cpp index 47699bfac7..37da93f90d 100644 --- a/src/fix_langevin.cpp +++ b/src/fix_langevin.cpp @@ -338,7 +338,6 @@ void FixLangevin::post_force_no_tally() } } } - } /* ---------------------------------------------------------------------- */ From 995a92b9f3efa35a4a2574069583011d7e2772ca Mon Sep 17 00:00:00 2001 From: sjplimp Date: Thu, 28 Apr 2011 18:52:32 +0000 Subject: [PATCH 03/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6030 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 6970bbeb4a..66a97ccc65 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "20 Apr 2011" +#define LAMMPS_VERSION "27 Apr 2011" From 199c005d935e49b772befeedfc2150f1dbfe7f82 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 15:52:26 +0000 Subject: [PATCH 04/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6033 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/ASPHERE/compute_temp_asphere.cpp | 227 ++++++++++++++++++--------- src/ASPHERE/compute_temp_asphere.h | 2 +- src/ASPHERE/fix_nve_asphere.cpp | 8 +- src/USER-EFF/fix_langevin_eff.cpp | 16 +- src/compute_temp_sphere.cpp | 143 +++++++++++------ src/compute_temp_sphere.h | 2 +- src/fix_langevin.cpp | 176 +++++++++++++++++++-- src/fix_langevin.h | 6 +- src/fix_rigid.cpp | 166 ++++++++++++++++++-- src/fix_rigid.h | 8 + 10 files changed, 601 insertions(+), 153 deletions(-) diff --git a/src/ASPHERE/compute_temp_asphere.cpp b/src/ASPHERE/compute_temp_asphere.cpp index e4e1177a7e..f2d34ac72a 100755 --- a/src/ASPHERE/compute_temp_asphere.cpp +++ b/src/ASPHERE/compute_temp_asphere.cpp @@ -32,13 +32,16 @@ using namespace LAMMPS_NS; +enum{ROTATE,ALL}; + +#define INERTIA 0.2 // moment of inertia for ellipsoid + /* 
---------------------------------------------------------------------- */ ComputeTempAsphere::ComputeTempAsphere(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg) { - if (narg != 3 && narg != 4) - error->all("Illegal compute temp/asphere command"); + if (narg < 3) error->all("Illegal compute temp/asphere command"); scalar_flag = vector_flag = 1; size_vector = 6; @@ -48,11 +51,24 @@ ComputeTempAsphere::ComputeTempAsphere(LAMMPS *lmp, int narg, char **arg) : tempbias = 0; id_bias = NULL; - if (narg == 4) { - tempbias = 1; - int n = strlen(arg[3]) + 1; - id_bias = new char[n]; - strcpy(id_bias,arg[3]); + mode = ALL; + + int iarg = 3; + while (iarg < narg) { + if (strcmp(arg[iarg],"bias") == 0) { + if (iarg+2 > narg) error->all("Illegal compute temp/asphere command"); + tempbias = 1; + int n = strlen(arg[iarg+1]) + 1; + id_bias = new char[n]; + strcpy(id_bias,arg[iarg+1]); + iarg += 2; + } else if (strcmp(arg[iarg],"dof") == 0) { + if (iarg+2 > narg) error->all("Illegal compute temp/asphere command"); + if (strcmp(arg[iarg+1],"rotate") == 0) mode = ROTATE; + else if (strcmp(arg[iarg+1],"all") == 0) mode = ALL; + else error->all("Illegal compute temp/asphere command"); + iarg += 2; + } else error->all("Illegal compute temp/asphere command"); } vector = new double[6]; @@ -76,8 +92,7 @@ ComputeTempAsphere::~ComputeTempAsphere() void ComputeTempAsphere::init() { - // check that all particles are finite-size - // no point particles allowed, spherical is OK + // check that all particles are finite-size, no point particles allowed int *ellipsoid = atom->ellipsoid; int *mask = atom->mask; @@ -114,18 +129,26 @@ void ComputeTempAsphere::init() void ComputeTempAsphere::dof_compute() { // 6 dof for 3d, 3 dof for 2d + // which dof are included also depends on mode // assume full rotation of extended particles // user should correct this via compute_modify if needed double natoms = group->count(igroup); - int nper = 6; - if (domain->dimension == 2) nper = 3; + int nper; + 
if (domain->dimension == 3) { + if (mode == ALL) nper = 6; + else nper = 3; + } else { + if (mode == ALL) nper = 3; + else nper = 1; + } dof = nper*natoms; // additional adjustments to dof - if (tempbias == 1) dof -= tbias->dof_remove(-1) * natoms; - else if (tempbias == 2) { + if (tempbias == 1) { + if (mode == ALL) dof -= tbias->dof_remove(-1) * natoms; + } else if (tempbias == 2) { int *mask = atom->mask; int nlocal = atom->nlocal; int count = 0; @@ -154,46 +177,73 @@ double ComputeTempAsphere::compute_scalar() } AtomVecEllipsoid::Bonus *bonus = avec->bonus; - int *ellipsoid = atom->ellipsoid; double **v = atom->v; double **angmom = atom->angmom; double *rmass = atom->rmass; + int *ellipsoid = atom->ellipsoid; int *mask = atom->mask; int nlocal = atom->nlocal; double *shape,*quat; double wbody[3],inertia[3]; double rot[3][3]; - double t = 0.0; - // sum translationals and rotational energy for each particle + // sum translational and rotational energy for each particle // no point particles since divide by inertia - for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { + double t = 0.0; - shape = bonus[ellipsoid[i]].shape; - quat = bonus[ellipsoid[i]].quat; + if (mode == ALL) { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) * rmass[i]; - t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) * rmass[i]; + // principal moments of inertia - // principal moments of inertia + shape = bonus[ellipsoid[i]].shape; + quat = bonus[ellipsoid[i]].quat; - inertia[0] = rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]) / 5.0; - inertia[1] = rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]) / 5.0; - inertia[2] = rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]) / 5.0; + inertia[0] = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); - // wbody = 
angular velocity in body frame + // wbody = angular velocity in body frame - MathExtra::quat_to_mat(quat,rot); - MathExtra::transpose_matvec(rot,angmom[i],wbody); - wbody[0] /= inertia[0]; - wbody[1] /= inertia[1]; - wbody[2] /= inertia[2]; + MathExtra::quat_to_mat(quat,rot); + MathExtra::transpose_matvec(rot,angmom[i],wbody); + wbody[0] /= inertia[0]; + wbody[1] /= inertia[1]; + wbody[2] /= inertia[2]; + + t += inertia[0]*wbody[0]*wbody[0] + + inertia[1]*wbody[1]*wbody[1] + inertia[2]*wbody[2]*wbody[2]; + } - t += inertia[0]*wbody[0]*wbody[0] + - inertia[1]*wbody[1]*wbody[1] + inertia[2]*wbody[2]*wbody[2]; - } + } else { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + + // principal moments of inertia + + shape = bonus[ellipsoid[i]].shape; + quat = bonus[ellipsoid[i]].quat; + + inertia[0] = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + + // wbody = angular velocity in body frame + + MathExtra::quat_to_mat(quat,rot); + MathExtra::transpose_matvec(rot,angmom[i],wbody); + wbody[0] /= inertia[0]; + wbody[1] /= inertia[1]; + wbody[2] /= inertia[2]; + + t += inertia[0]*wbody[0]*wbody[0] + + inertia[1]*wbody[1]*wbody[1] + inertia[2]*wbody[2]*wbody[2]; + } + } if (tempbias) tbias->restore_bias_all(); @@ -217,58 +267,93 @@ void ComputeTempAsphere::compute_vector() } AtomVecEllipsoid::Bonus *bonus = avec->bonus; - int *ellipsoid = atom->ellipsoid; double **v = atom->v; double **angmom = atom->angmom; double *rmass = atom->rmass; + int *ellipsoid = atom->ellipsoid; int *mask = atom->mask; int nlocal = atom->nlocal; double *shape,*quat; - double wbody[3],inertia[3]; + double wbody[3],inertia[3],t[6]; double rot[3][3]; - double massone,t[6]; + double massone; + + // sum translational and rotational energy for each particle + // no point particles since divide by inertia + for (i = 0; i < 6; i++) t[i] = 
0.0; - for (i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { + if (mode == ALL) { + for (i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + massone = rmass[i]; + t[0] += massone * v[i][0]*v[i][0]; + t[1] += massone * v[i][1]*v[i][1]; + t[2] += massone * v[i][2]*v[i][2]; + t[3] += massone * v[i][0]*v[i][1]; + t[4] += massone * v[i][0]*v[i][2]; + t[5] += massone * v[i][1]*v[i][2]; + + // principal moments of inertia - shape = bonus[ellipsoid[i]].shape; - quat = bonus[ellipsoid[i]].quat; + shape = bonus[ellipsoid[i]].shape; + quat = bonus[ellipsoid[i]].quat; - // translational kinetic energy + inertia[0] = INERTIA*massone * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = INERTIA*massone * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = INERTIA*massone * (shape[0]*shape[0]+shape[1]*shape[1]); + + // wbody = angular velocity in body frame + + MathExtra::quat_to_mat(quat,rot); + MathExtra::transpose_matvec(rot,angmom[i],wbody); + wbody[0] /= inertia[0]; + wbody[1] /= inertia[1]; + wbody[2] /= inertia[2]; + + // rotational kinetic energy + + t[0] += inertia[0]*wbody[0]*wbody[0]; + t[1] += inertia[1]*wbody[1]*wbody[1]; + t[2] += inertia[2]*wbody[2]*wbody[2]; + t[3] += inertia[0]*wbody[0]*wbody[1]; + t[4] += inertia[1]*wbody[0]*wbody[2]; + t[5] += inertia[2]*wbody[1]*wbody[2]; + } - massone = rmass[i]; - t[0] += massone * v[i][0]*v[i][0]; - t[1] += massone * v[i][1]*v[i][1]; - t[2] += massone * v[i][2]*v[i][2]; - t[3] += massone * v[i][0]*v[i][1]; - t[4] += massone * v[i][0]*v[i][2]; - t[5] += massone * v[i][1]*v[i][2]; + } else { + for (i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + + // principal moments of inertia - // principal moments of inertia + shape = bonus[ellipsoid[i]].shape; + quat = bonus[ellipsoid[i]].quat; + massone = rmass[i]; - inertia[0] = rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]) / 5.0; - inertia[1] = rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]) / 5.0; - inertia[2] = rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]) / 
5.0; - - // wbody = angular velocity in body frame - - MathExtra::quat_to_mat(quat,rot); - MathExtra::transpose_matvec(rot,angmom[i],wbody); - wbody[0] /= inertia[0]; - wbody[1] /= inertia[1]; - wbody[2] /= inertia[2]; - - // rotational kinetic energy - - t[0] += inertia[0]*wbody[0]*wbody[0]; - t[1] += inertia[1]*wbody[1]*wbody[1]; - t[2] += inertia[2]*wbody[2]*wbody[2]; - t[3] += inertia[0]*wbody[0]*wbody[1]; - t[4] += inertia[1]*wbody[0]*wbody[2]; - t[5] += inertia[2]*wbody[1]*wbody[2]; - } + inertia[0] = INERTIA*massone * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = INERTIA*massone * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = INERTIA*massone * (shape[0]*shape[0]+shape[1]*shape[1]); + + // wbody = angular velocity in body frame + + MathExtra::quat_to_mat(quat,rot); + MathExtra::transpose_matvec(rot,angmom[i],wbody); + wbody[0] /= inertia[0]; + wbody[1] /= inertia[1]; + wbody[2] /= inertia[2]; + + // rotational kinetic energy + + t[0] += inertia[0]*wbody[0]*wbody[0]; + t[1] += inertia[1]*wbody[1]*wbody[1]; + t[2] += inertia[2]*wbody[2]*wbody[2]; + t[3] += inertia[0]*wbody[0]*wbody[1]; + t[4] += inertia[1]*wbody[0]*wbody[2]; + t[5] += inertia[2]*wbody[1]*wbody[2]; + } + } if (tempbias) tbias->restore_bias_all(); diff --git a/src/ASPHERE/compute_temp_asphere.h b/src/ASPHERE/compute_temp_asphere.h index 19e29ebf1b..dde67a1bd5 100755 --- a/src/ASPHERE/compute_temp_asphere.h +++ b/src/ASPHERE/compute_temp_asphere.h @@ -36,7 +36,7 @@ class ComputeTempAsphere : public Compute { void restore_bias(int, double *); private: - int fix_dof; + int fix_dof,mode; double tfactor; char *id_bias; class Compute *tbias; // ptr to additional bias compute diff --git a/src/ASPHERE/fix_nve_asphere.cpp b/src/ASPHERE/fix_nve_asphere.cpp index 4f8c94e208..9e4155581f 100755 --- a/src/ASPHERE/fix_nve_asphere.cpp +++ b/src/ASPHERE/fix_nve_asphere.cpp @@ -29,6 +29,8 @@ using namespace LAMMPS_NS; +#define INERTIA 0.2 // moment of inertia for ellipsoid + /* 
---------------------------------------------------------------------- */ FixNVEAsphere::FixNVEAsphere(LAMMPS *lmp, int narg, char **arg) : @@ -103,9 +105,9 @@ void FixNVEAsphere::initial_integrate(int vflag) shape = bonus[ellipsoid[i]].shape; quat = bonus[ellipsoid[i]].quat; - inertia[0] = rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]) / 5.0; - inertia[1] = rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]) / 5.0; - inertia[2] = rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]) / 5.0; + inertia[0] = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); // compute omega at 1/2 step from angmom at 1/2 step and current q // update quaternion a full step via Richardson iteration diff --git a/src/USER-EFF/fix_langevin_eff.cpp b/src/USER-EFF/fix_langevin_eff.cpp index 2758ebea07..7328fdb23b 100644 --- a/src/USER-EFF/fix_langevin_eff.cpp +++ b/src/USER-EFF/fix_langevin_eff.cpp @@ -88,7 +88,9 @@ void FixLangevinEff::post_force_no_tally() f[i][0] += gamma1*v[i][0] + gamma2*(random->uniform()-0.5); f[i][1] += gamma1*v[i][1] + gamma2*(random->uniform()-0.5); f[i][2] += gamma1*v[i][2] + gamma2*(random->uniform()-0.5); - if (abs(spin[i])==1) erforce[i] += 0.75*gamma1*ervel[i] + 0.866025404*gamma2*(random->uniform()-0.5); + if (abs(spin[i])==1) + erforce[i] += 0.75*gamma1*ervel[i] + + 0.866025404*gamma2*(random->uniform()-0.5); } } } else if (which == BIAS) { @@ -105,7 +107,8 @@ void FixLangevinEff::post_force_no_tally() if (v[i][2] != 0.0) f[i][2] += gamma1*v[i][2] + gamma2*(random->uniform()-0.5); if (abs(spin[i])==1 && ervel[i] != 0.0) - erforce[i] += 0.75*gamma1*ervel[i] + 0.866025404*gamma2*(random->uniform()-0.5); + erforce[i] += 0.75*gamma1*ervel[i] + + 0.866025404*gamma2*(random->uniform()-0.5); temperature->restore_bias(i,v[i]); } } @@ -158,7 +161,8 @@ void FixLangevinEff::post_force_tally() flangevin[i][0] = 
gamma1*v[i][0] + gamma2*(random->uniform()-0.5); flangevin[i][1] = gamma1*v[i][1] + gamma2*(random->uniform()-0.5); flangevin[i][2] = gamma1*v[i][2] + gamma2*(random->uniform()-0.5); - erforcelangevin[i] = 0.75*gamma1*ervel[i]+0.866025404*gamma2*(random->uniform()-0.5); + erforcelangevin[i] = 0.75*gamma1*ervel[i] + + 0.866025404*gamma2*(random->uniform()-0.5); f[i][0] += flangevin[i][0]; f[i][1] += flangevin[i][1]; f[i][2] += flangevin[i][2]; @@ -175,14 +179,16 @@ void FixLangevinEff::post_force_tally() flangevin[i][0] = gamma1*v[i][0] + gamma2*(random->uniform()-0.5); flangevin[i][1] = gamma1*v[i][1] + gamma2*(random->uniform()-0.5); flangevin[i][2] = gamma1*v[i][2] + gamma2*(random->uniform()-0.5); - erforcelangevin[i] = 0.75*gamma1*ervel[i]+0.866025404*gamma2*(random->uniform()-0.5); + erforcelangevin[i] = 0.75*gamma1*ervel[i] + + 0.866025404*gamma2*(random->uniform()-0.5); if (v[i][0] != 0.0) f[i][0] += flangevin[i][0]; else flangevin[i][0] = 0.0; if (v[i][1] != 0.0) f[i][1] += flangevin[i][1]; else flangevin[i][1] = 0.0; if (v[i][2] != 0.0) f[i][2] += flangevin[i][2]; else flangevin[i][2] = 0.0; - if (abs(spin[i])==1 && ervel[i] != 0.0) erforce[i] += erforcelangevin[i]; + if (abs(spin[i])==1 && ervel[i] != 0.0) + erforce[i] += erforcelangevin[i]; temperature->restore_bias(i,v[i]); } } diff --git a/src/compute_temp_sphere.cpp b/src/compute_temp_sphere.cpp index bad55efdb8..93c9ec74aa 100644 --- a/src/compute_temp_sphere.cpp +++ b/src/compute_temp_sphere.cpp @@ -26,6 +26,8 @@ using namespace LAMMPS_NS; +enum{ROTATE,ALL}; + #define INERTIA 0.4 // moment of inertia for sphere /* ---------------------------------------------------------------------- */ @@ -33,8 +35,7 @@ using namespace LAMMPS_NS; ComputeTempSphere::ComputeTempSphere(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg) { - if (narg != 3 && narg != 4) - error->all("Illegal compute temp/sphere command"); + if (narg < 3) error->all("Illegal compute temp/sphere command"); scalar_flag = 
vector_flag = 1; size_vector = 6; @@ -44,11 +45,24 @@ ComputeTempSphere::ComputeTempSphere(LAMMPS *lmp, int narg, char **arg) : tempbias = 0; id_bias = NULL; - if (narg == 4) { - tempbias = 1; - int n = strlen(arg[3]) + 1; - id_bias = new char[n]; - strcpy(id_bias,arg[3]); + mode = ALL; + + int iarg = 3; + while (iarg < narg) { + if (strcmp(arg[iarg],"bias") == 0) { + if (iarg+2 > narg) error->all("Illegal compute temp/sphere command"); + tempbias = 1; + int n = strlen(arg[iarg+1]) + 1; + id_bias = new char[n]; + strcpy(id_bias,arg[iarg+1]); + iarg += 2; + } else if (strcmp(arg[iarg],"dof") == 0) { + if (iarg+2 > narg) error->all("Illegal compute temp/sphere command"); + if (strcmp(arg[iarg+1],"rotate") == 0) mode = ROTATE; + else if (strcmp(arg[iarg+1],"all") == 0) mode = ALL; + else error->all("Illegal compute temp/sphere command"); + iarg += 2; + } else error->all("Illegal compute temp/sphere command"); } vector = new double[6]; @@ -100,27 +114,34 @@ void ComputeTempSphere::dof_compute() // 6 or 3 dof for extended/point particles for 3d // 3 or 2 dof for extended/point particles for 2d + // which dof are included also depends on mode // assume full rotation of extended particles // user should correct this via compute_modify if needed - int dimension = domain->dimension; - double *radius = atom->radius; int *mask = atom->mask; int nlocal = atom->nlocal; count = 0; - if (dimension == 3) { + if (domain->dimension == 3) { for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { - if (radius[i] == 0.0) count += 3; - else count += 6; + if (radius[i] == 0.0) { + if (mode == ALL) count += 3; + } else { + if (mode == ALL) count += 6; + else count += 3; + } } } else { for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { - if (radius[i] == 0.0) count += 2; - else count += 3; + if (radius[i] == 0.0) { + if (mode == ALL) count += 2; + } else { + if (mode == ALL) count += 3; + else count += 1; + } } } @@ -130,28 +151,38 @@ void ComputeTempSphere::dof_compute() // 
additional adjustments to dof if (tempbias == 1) { - double natoms = group->count(igroup); - dof -= tbias->dof_remove(-1) * natoms; + if (mode == ALL) { + double natoms = group->count(igroup); + dof -= tbias->dof_remove(-1) * natoms; + } } else if (tempbias == 2) { int *mask = atom->mask; int nlocal = atom->nlocal; count = 0; - if (dimension == 3) { + if (domain->dimension == 3) { for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { if (tbias->dof_remove(i)) { - if (radius[i] == 0.0) count += 3; - else count += 6; + if (radius[i] == 0.0) { + if (mode == ALL) count += 3; + } else { + if (mode == ALL) count += 6; + else count += 3; + } } } } else { for (int i = 0; i < nlocal; i++) if (mask[i] & groupbit) { if (tbias->dof_remove(i)) { - if (radius[i] == 0.0) count += 2; - else count += 3; + if (radius[i] == 0.0) { + if (mode == ALL) count += 2; + } else { + if (mode == ALL) count += 3; + else count += 1; + } } } } @@ -187,12 +218,19 @@ double ComputeTempSphere::compute_scalar() double t = 0.0; - for (int i = 0; i < nlocal; i++) - if (mask[i] & groupbit) { - t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) * rmass[i]; - t += (omega[i][0]*omega[i][0] + omega[i][1]*omega[i][1] + - omega[i][2]*omega[i][2]) * INERTIA*radius[i]*radius[i]*rmass[i]; - } + if (mode == ALL) { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + t += (v[i][0]*v[i][0] + v[i][1]*v[i][1] + v[i][2]*v[i][2]) * rmass[i]; + t += (omega[i][0]*omega[i][0] + omega[i][1]*omega[i][1] + + omega[i][2]*omega[i][2]) * INERTIA*rmass[i]*radius[i]*radius[i]; + } + } else { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) + t += (omega[i][0]*omega[i][0] + omega[i][1]*omega[i][1] + + omega[i][2]*omega[i][2]) * INERTIA*rmass[i]*radius[i]*radius[i]; + } if (tempbias) tbias->restore_bias_all(); @@ -225,25 +263,38 @@ void ComputeTempSphere::compute_vector() double massone,inertiaone,t[6]; for (int i = 0; i < 6; i++) t[i] = 0.0; - for (int i = 0; i < nlocal; i++) - if (mask[i] & 
groupbit) { - massone = rmass[i]; - t[0] += massone * v[i][0]*v[i][0]; - t[1] += massone * v[i][1]*v[i][1]; - t[2] += massone * v[i][2]*v[i][2]; - t[3] += massone * v[i][0]*v[i][1]; - t[4] += massone * v[i][0]*v[i][2]; - t[5] += massone * v[i][1]*v[i][2]; - - inertiaone = INERTIA*radius[i]*radius[i]*rmass[i]; - t[0] += inertiaone * omega[i][0]*omega[i][0]; - t[1] += inertiaone * omega[i][1]*omega[i][1]; - t[2] += inertiaone * omega[i][2]*omega[i][2]; - t[3] += inertiaone * omega[i][0]*omega[i][1]; - t[4] += inertiaone * omega[i][0]*omega[i][2]; - t[5] += inertiaone * omega[i][1]*omega[i][2]; - } - + if (mode == ALL) { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + massone = rmass[i]; + t[0] += massone * v[i][0]*v[i][0]; + t[1] += massone * v[i][1]*v[i][1]; + t[2] += massone * v[i][2]*v[i][2]; + t[3] += massone * v[i][0]*v[i][1]; + t[4] += massone * v[i][0]*v[i][2]; + t[5] += massone * v[i][1]*v[i][2]; + + inertiaone = INERTIA*rmass[i]*radius[i]*radius[i]; + t[0] += inertiaone * omega[i][0]*omega[i][0]; + t[1] += inertiaone * omega[i][1]*omega[i][1]; + t[2] += inertiaone * omega[i][2]*omega[i][2]; + t[3] += inertiaone * omega[i][0]*omega[i][1]; + t[4] += inertiaone * omega[i][0]*omega[i][2]; + t[5] += inertiaone * omega[i][1]*omega[i][2]; + } + } else { + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + inertiaone = INERTIA*rmass[i]*radius[i]*radius[i]; + t[0] += inertiaone * omega[i][0]*omega[i][0]; + t[1] += inertiaone * omega[i][1]*omega[i][1]; + t[2] += inertiaone * omega[i][2]*omega[i][2]; + t[3] += inertiaone * omega[i][0]*omega[i][1]; + t[4] += inertiaone * omega[i][0]*omega[i][2]; + t[5] += inertiaone * omega[i][1]*omega[i][2]; + } + } + if (tempbias) tbias->restore_bias_all(); MPI_Allreduce(t,vector,6,MPI_DOUBLE,MPI_SUM,world); diff --git a/src/compute_temp_sphere.h b/src/compute_temp_sphere.h index 86285061bd..c0b29dce59 100644 --- a/src/compute_temp_sphere.h +++ b/src/compute_temp_sphere.h @@ -36,7 +36,7 @@ class 
ComputeTempSphere : public Compute { void restore_bias(int, double *); private: - int fix_dof; + int fix_dof,mode; double tfactor; double *inertia; char *id_bias; diff --git a/src/fix_langevin.cpp b/src/fix_langevin.cpp index 37da93f90d..0d233126ef 100644 --- a/src/fix_langevin.cpp +++ b/src/fix_langevin.cpp @@ -20,7 +20,9 @@ #include "string.h" #include "stdlib.h" #include "fix_langevin.h" +#include "math_extra.h" #include "atom.h" +#include "atom_vec_ellipsoid.h" #include "force.h" #include "update.h" #include "modify.h" @@ -38,6 +40,9 @@ using namespace LAMMPS_NS; enum{NOBIAS,BIAS}; +#define SINERTIA 0.4 // moment of inertia for sphere +#define EINERTIA 0.2 // moment of inertia for ellipsoid + /* ---------------------------------------------------------------------- */ FixLangevin::FixLangevin(LAMMPS *lmp, int narg, char **arg) : @@ -71,6 +76,7 @@ FixLangevin::FixLangevin(LAMMPS *lmp, int narg, char **arg) : // optional args for (int i = 1; i <= atom->ntypes; i++) ratio[i] = 1.0; + oflag = aflag = 0; tally = 0; zeroflag = 0; @@ -96,9 +102,29 @@ FixLangevin::FixLangevin(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+1],"yes") == 0) zeroflag = 1; else error->all("Illegal fix langevin command"); iarg += 2; + } else if (strcmp(arg[iarg],"omega") == 0) { + if (iarg+2 > narg) error->all("Illegal fix langevin command"); + if (strcmp(arg[iarg+1],"no") == 0) oflag = 0; + else if (strcmp(arg[iarg+1],"yes") == 0) oflag = 1; + else error->all("Illegal fix langevin command"); + iarg += 2; + } else if (strcmp(arg[iarg],"angmom") == 0) { + if (iarg+2 > narg) error->all("Illegal fix langevin command"); + if (strcmp(arg[iarg+1],"no") == 0) aflag = 0; + else if (strcmp(arg[iarg+1],"yes") == 0) aflag = 1; + else error->all("Illegal fix langevin command"); + iarg += 2; } else error->all("Illegal fix langevin command"); } + // error check + + if (aflag) { + avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid"); + if (!avec) + error->all("Fix langevin angmom 
requires atom style ellipsoid"); + } + // set temperature = NULL, user can override via fix_modify if wants bias id_temp = NULL; @@ -140,6 +166,35 @@ int FixLangevin::setmask() void FixLangevin::init() { + if (oflag && !atom->sphere_flag) + error->all("Fix langevin omega require atom style sphere"); + if (aflag && !atom->ellipsoid_flag) + error->all("Fix langevin angmom require atom style ellipsoid"); + + // if oflag or aflag set, check that all group particles are finite-size + + if (oflag) { + double *radius = atom->radius; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) + if (radius[i] == 0.0) + error->one("Fix langevin omega requires extended particles"); + } + + if (aflag) { + int *ellipsoid = atom->ellipsoid; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) + if (ellipsoid[i] < 0) + error->one("Fix langevin angmom requires extended particles"); + } + // set force prefactors if (!atom->rmass) { @@ -219,6 +274,11 @@ void FixLangevin::post_force_no_tally() double fran[3],fsum[3],fsumall[3]; fsum[0] = fsum[1] = fsum[2] = 0.0; bigint count; + + double boltz = force->boltz; + double dt = update->dt; + double mvv2e = force->mvv2e; + double ftm2v = force->ftm2v; if (zeroflag) { count = group->count(igroup); @@ -227,11 +287,6 @@ void FixLangevin::post_force_no_tally() } if (rmass) { - double boltz = force->boltz; - double dt = update->dt; - double mvv2e = force->mvv2e; - double ftm2v = force->ftm2v; - if (which == NOBIAS) { for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { @@ -280,7 +335,6 @@ void FixLangevin::post_force_no_tally() } else { if (which == NOBIAS) { - for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { gamma1 = gfactor1[type[i]]; @@ -295,7 +349,6 @@ void FixLangevin::post_force_no_tally() fsum[1] += fran[1]; fsum[2] += fran[2]; } - } } else if (which == BIAS) { @@ -338,6 +391,11 @@ void 
FixLangevin::post_force_no_tally() } } } + + // thermostat omega and angmom + + if (oflag) omega_thermostat(tsqrt); + if (aflag) angmom_thermostat(tsqrt); } /* ---------------------------------------------------------------------- */ @@ -373,12 +431,12 @@ void FixLangevin::post_force_tally() // test v = 0 since some computes mask non-participating atoms via v = 0 // and added force has extra term not multiplied by v = 0 - if (rmass) { - double boltz = force->boltz; - double dt = update->dt; - double mvv2e = force->mvv2e; - double ftm2v = force->ftm2v; + double boltz = force->boltz; + double dt = update->dt; + double mvv2e = force->mvv2e; + double ftm2v = force->ftm2v; + if (rmass) { if (which == NOBIAS) { for (int i = 0; i < nlocal; i++) { if (mask[i] & groupbit) { @@ -454,6 +512,100 @@ void FixLangevin::post_force_tally() } } } + + // thermostat omega and angmom + + if (oflag) omega_thermostat(tsqrt); + if (aflag) angmom_thermostat(tsqrt); +} + +/* ---------------------------------------------------------------------- + thermostat rotational dof via omega +------------------------------------------------------------------------- */ + +void FixLangevin::omega_thermostat(double tsqrt) +{ + double gamma1,gamma2; + + double boltz = force->boltz; + double dt = update->dt; + double mvv2e = force->mvv2e; + double ftm2v = force->ftm2v; + + double **torque = atom->torque; + double **omega = atom->omega; + double *radius = atom->radius; + double *rmass = atom->rmass; + int *mask = atom->mask; + int *type = atom->type; + int nlocal = atom->nlocal; + + double tran[3]; + double inertiaone; + + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + inertiaone = SINERTIA*radius[i]*radius[i]*rmass[i]; + gamma1 = -inertiaone / t_period / ftm2v; + gamma2 = sqrt(inertiaone) * sqrt(24.0*boltz/t_period/dt/mvv2e) / ftm2v; + gamma1 *= 1.0/ratio[type[i]]; + gamma2 *= 1.0/sqrt(ratio[type[i]]) * tsqrt; + tran[0] = gamma2*(random->uniform()-0.5); + tran[1] = 
gamma2*(random->uniform()-0.5); + tran[2] = gamma2*(random->uniform()-0.5); + torque[i][0] += gamma1*omega[i][0] + tran[0]; + torque[i][1] += gamma1*omega[i][1] + tran[1]; + torque[i][2] += gamma1*omega[i][2] + tran[2]; + } + } +} + +/* ---------------------------------------------------------------------- + thermostat rotational dof via angmom +------------------------------------------------------------------------- */ + +void FixLangevin::angmom_thermostat(double tsqrt) +{ + double gamma1,gamma2; + + double boltz = force->boltz; + double dt = update->dt; + double mvv2e = force->mvv2e; + double ftm2v = force->ftm2v; + + AtomVecEllipsoid::Bonus *bonus = avec->bonus; + double **torque = atom->torque; + double **angmom = atom->angmom; + double *rmass = atom->rmass; + int *ellipsoid = atom->ellipsoid; + int *mask = atom->mask; + int *type = atom->type; + int nlocal = atom->nlocal; + + double inertia[3],wbody[3],omega[3],tran[3],rot[3][3]; + double *shape,*quat; + + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + shape = bonus[ellipsoid[i]].shape; + inertia[0] = EINERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + inertia[1] = EINERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + inertia[2] = EINERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + quat = bonus[ellipsoid[i]].quat; + MathExtra::mq_to_omega(angmom[i],quat,inertia,omega); + + gamma1 = -1.0 / t_period / ftm2v; + gamma2 = sqrt(24.0*boltz/t_period/dt/mvv2e) / ftm2v; + gamma1 *= 1.0/ratio[type[i]]; + gamma2 *= 1.0/sqrt(ratio[type[i]]) * tsqrt; + tran[0] = sqrt(inertia[0])*gamma2*(random->uniform()-0.5); + tran[1] = sqrt(inertia[1])*gamma2*(random->uniform()-0.5); + tran[2] = sqrt(inertia[2])*gamma2*(random->uniform()-0.5); + torque[i][0] += inertia[0]*gamma1*omega[0] + tran[0]; + torque[i][1] += inertia[1]*gamma1*omega[1] + tran[1]; + torque[i][2] += inertia[2]*gamma1*omega[2] + tran[2]; + } + } } /* ---------------------------------------------------------------------- 
diff --git a/src/fix_langevin.h b/src/fix_langevin.h index f325befd46..735f6cdcf0 100644 --- a/src/fix_langevin.h +++ b/src/fix_langevin.h @@ -41,11 +41,13 @@ class FixLangevin : public Fix { double memory_usage(); protected: - int which,tally,zeroflag; + int which,tally,zeroflag,oflag,aflag; double t_start,t_stop,t_period; double *gfactor1,*gfactor2,*ratio; double energy,energy_onestep; + class AtomVecEllipsoid *avec; + int nmax; double **flangevin; @@ -57,6 +59,8 @@ class FixLangevin : public Fix { virtual void post_force_no_tally(); virtual void post_force_tally(); + void omega_thermostat(double); + void angmom_thermostat(double); }; } diff --git a/src/fix_rigid.cpp b/src/fix_rigid.cpp index 4d0686409d..6d43b9949e 100644 --- a/src/fix_rigid.cpp +++ b/src/fix_rigid.cpp @@ -25,6 +25,7 @@ #include "modify.h" #include "group.h" #include "comm.h" +#include "random_mars.h" #include "force.h" #include "output.h" #include "memory.h" @@ -45,11 +46,16 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : { int i,ibody; + scalar_flag = 1; + extscalar = 0; time_integrate = 1; rigid_flag = 1; virial_flag = 1; create_attribute = 1; + MPI_Comm_rank(world,&me); + MPI_Comm_size(world,&nprocs); + // perform initial allocation of atom-based arrays // register with Atom class @@ -193,12 +199,14 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : memory->create(imagebody,nbody,"rigid:imagebody"); memory->create(fflag,nbody,3,"rigid:fflag"); memory->create(tflag,nbody,3,"rigid:tflag"); + memory->create(langextra,nbody,6,"rigid:langextra"); memory->create(sum,nbody,6,"rigid:sum"); memory->create(all,nbody,6,"rigid:all"); memory->create(remapflag,nbody,4,"rigid:remapflag"); // initialize force/torque flags to default = 1.0 + // for 2d: fz, tx, ty = 0.0 array_flag = 1; size_array_rows = nbody; @@ -209,10 +217,13 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : for (i = 0; i < nbody; i++) { fflag[i][0] = fflag[i][1] = fflag[i][2] = 1.0; tflag[i][0] = tflag[i][1] = 
tflag[i][2] = 1.0; + if (domain->dimension == 2) fflag[i][2] = tflag[i][0] = tflag[i][1] = 0.0; } // parse optional args + int seed; + langflag = 0; tempflag = 0; pressflag = 0; t_chain = 10; @@ -238,6 +249,9 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+4],"on") == 0) zflag = 1.0; else error->all("Illegal fix rigid command"); + if (domain->dimension == 2 && zflag == 1.0) + error->all("Fix rigid z force cannot be on for 2d simulation"); + int count = 0; for (int m = mlo; m <= mhi; m++) { fflag[m-1][0] = xflag; @@ -266,6 +280,9 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+4],"on") == 0) zflag = 1.0; else error->all("Illegal fix rigid command"); + if (domain->dimension == 2 && (xflag == 1.0 || yflag == 1.0)) + error->all("Fix rigid xy torque cannot be on for 2d simulation"); + int count = 0; for (int m = mlo; m <= mhi; m++) { tflag[m-1][0] = xflag; @@ -277,10 +294,24 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : iarg += 5; + } else if (strcmp(arg[iarg],"langevin") == 0) { + if (iarg+5 > narg) error->all("Illegal fix rigid command"); + if (strcmp(style,"rigid") != 0 && strcmp(style,"rigid/nve") != 0) + error->all("Illegal fix rigid command"); + langflag = 1; + t_start = atof(arg[iarg+1]); + t_stop = atof(arg[iarg+2]); + t_period = atof(arg[iarg+3]); + seed = atoi(arg[iarg+4]); + if (t_period <= 0.0) + error->all("Fix rigid langevin period must be > 0.0"); + if (seed <= 0) error->all("Illegal fix rigid command"); + iarg += 5; + } else if (strcmp(arg[iarg],"temp") == 0) { if (iarg+4 > narg) error->all("Illegal fix rigid command"); if (strcmp(style,"rigid/nvt") != 0 && strcmp(style,"rigid/npt") != 0) - error->all("Illegal fix/rigid command"); + error->all("Illegal fix rigid command"); tempflag = 1; t_start = atof(arg[iarg+1]); t_stop = atof(arg[iarg+2]); @@ -290,7 +321,7 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : } else if (strcmp(arg[iarg],"press") == 0) { if 
(iarg+4 > narg) error->all("Illegal fix rigid command"); if (strcmp(style,"rigid/npt") != 0) - error->all("Illegal fix/rigid command"); + error->all("Illegal fix rigid command"); pressflag = 1; p_start = atof(arg[iarg+1]); p_stop = atof(arg[iarg+2]); @@ -300,7 +331,7 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : } else if (strcmp(arg[iarg],"tparam") == 0) { if (iarg+4 > narg) error->all("Illegal fix rigid command"); if (strcmp(style,"rigid/nvt") != 0) - error->all("Illegal fix/rigid command"); + error->all("Illegal fix rigid command"); t_chain = atoi(arg[iarg+1]); t_iter = atoi(arg[iarg+2]); t_order = atoi(arg[iarg+3]); @@ -309,13 +340,18 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : } else if (strcmp(arg[iarg],"pparam") == 0) { if (iarg+2 > narg) error->all("Illegal fix rigid command"); if (strcmp(style,"rigid/npt") != 0) - error->all("Illegal fix/rigid command"); + error->all("Illegal fix rigid command"); p_chain = atoi(arg[iarg+1]); iarg += 2; } else error->all("Illegal fix rigid command"); } + // initialize Marsaglia RNG with processor-unique seed + + if (langflag) random = new RanMars(lmp,seed + me); + else random = NULL; + // initialize vector output quantities in case accessed before run for (i = 0; i < nbody; i++) { @@ -369,7 +405,7 @@ FixRigid::FixRigid(LAMMPS *lmp, int narg, char **arg) : int nsum = 0; for (ibody = 0; ibody < nbody; ibody++) nsum += nrigid[ibody]; - if (comm->me == 0) { + if (me == 0) { if (screen) fprintf(screen,"%d rigid bodies with %d atoms\n",nbody,nsum); if (logfile) fprintf(logfile,"%d rigid bodies with %d atoms\n",nbody,nsum); } @@ -383,6 +419,8 @@ FixRigid::~FixRigid() atom->delete_callback(id,0); + delete random; + // delete locally stored arrays memory->destroy(body); @@ -409,6 +447,7 @@ FixRigid::~FixRigid() memory->destroy(imagebody); memory->destroy(fflag); memory->destroy(tflag); + memory->destroy(langextra); memory->destroy(sum); memory->destroy(all); @@ -422,6 +461,7 @@ int FixRigid::setmask() int 
mask = 0; mask |= INITIAL_INTEGRATE; mask |= FINAL_INTEGRATE; + if (langflag) mask |= POST_FORCE; mask |= PRE_NEIGHBOR; mask |= INITIAL_INTEGRATE_RESPA; mask |= FINAL_INTEGRATE_RESPA; @@ -441,7 +481,7 @@ void FixRigid::init() int count = 0; for (int i = 0; i < modify->nfix; i++) if (strcmp(modify->fix[i]->style,"rigid") == 0) count++; - if (count > 1 && comm->me == 0) error->warning("More than one fix rigid"); + if (count > 1 && me == 0) error->warning("More than one fix rigid"); // error if npt,nph fix comes before rigid fix @@ -855,6 +895,15 @@ void FixRigid::init() fabs(all[ibody][5]/norm) > TOLERANCE) error->all("Fix rigid: Bad principal moments"); } + + // temperature scale factor + + double ndof = 0.0; + for (ibody = 0; ibody < nbody; ibody++) { + ndof += fflag[ibody][0] + fflag[ibody][1] + fflag[ibody][2]; + ndof += tflag[ibody][0] + tflag[ibody][1] + tflag[ibody][2]; + } + tfactor = force->mvv2e / (ndof * force->boltz); } /* ---------------------------------------------------------------------- */ @@ -998,6 +1047,13 @@ void FixRigid::setup(int vflag) torque[ibody][2] = all[ibody][5]; } + // zero langextra in case Langevin thermostat not used + // no point to calling post_force() here since langextra + // is only added to fcm/torque in final_integrate() + + for (ibody = 0; ibody < nbody; ibody++) + for (i = 0; i < 6; i++) langextra[ibody][i] = 0.0; + // virial setup before call to set_v if (vflag) v_setup(vflag); @@ -1072,6 +1128,50 @@ void FixRigid::initial_integrate(int vflag) set_xv(); } +/* ---------------------------------------------------------------------- + apply Langevin thermostat to all 6 DOF of rigid bodies + computed by proc 0, broadcast to other procs + unlike fix langevin, this stores extra force in extra arrays, + which are added in when final_integrate() calculates a new fcm/torque +------------------------------------------------------------------------- */ + +void FixRigid::post_force(int vflag) +{ + if (me == 0) { + double gamma1,gamma2; 
+ + double delta = update->ntimestep - update->beginstep; + delta /= update->endstep - update->beginstep; + double t_target = t_start + delta * (t_stop-t_start); + double tsqrt = sqrt(t_target); + + double boltz = force->boltz; + double dt = update->dt; + double mvv2e = force->mvv2e; + double ftm2v = force->ftm2v; + + for (int i = 0; i < nbody; i++) { + gamma1 = -masstotal[i] / t_period / ftm2v; + gamma2 = sqrt(masstotal[i]) * tsqrt * + sqrt(24.0*boltz/t_period/dt/mvv2e) / ftm2v; + langextra[i][0] = gamma1*vcm[i][0] + gamma2*(random->uniform()-0.5); + langextra[i][1] = gamma1*vcm[i][1] + gamma2*(random->uniform()-0.5); + langextra[i][2] = gamma1*vcm[i][2] + gamma2*(random->uniform()-0.5); + + gamma1 = -1.0 / t_period / ftm2v; + gamma2 = tsqrt * sqrt(24.0*boltz/t_period/dt/mvv2e) / ftm2v; + langextra[i][3] = inertia[i][0]*gamma1*omega[i][0] + + sqrt(inertia[i][0])*gamma2*(random->uniform()-0.5); + langextra[i][4] = inertia[i][1]*gamma1*omega[i][1] + + sqrt(inertia[i][1])*gamma2*(random->uniform()-0.5); + langextra[i][5] = inertia[i][2]*gamma1*omega[i][2] + + sqrt(inertia[i][2])*gamma2*(random->uniform()-0.5); + } + } + + MPI_Bcast(&langextra[0][0],6*nbody,MPI_DOUBLE,0,world); +} + /* ---------------------------------------------------------------------- */ void FixRigid::final_integrate() @@ -1150,13 +1250,17 @@ void FixRigid::final_integrate() MPI_Allreduce(sum[0],all[0],6*nbody,MPI_DOUBLE,MPI_SUM,world); + // update vcm and angmom + // include Langevin thermostat forces + // fflag,tflag = 0 for some dimensions in 2d + for (ibody = 0; ibody < nbody; ibody++) { - fcm[ibody][0] = all[ibody][0]; - fcm[ibody][1] = all[ibody][1]; - fcm[ibody][2] = all[ibody][2]; - torque[ibody][0] = all[ibody][3]; - torque[ibody][1] = all[ibody][4]; - torque[ibody][2] = all[ibody][5]; + fcm[ibody][0] = all[ibody][0] + langextra[ibody][0]; + fcm[ibody][1] = all[ibody][1] + langextra[ibody][1]; + fcm[ibody][2] = all[ibody][2] + langextra[ibody][2]; + torque[ibody][0] = all[ibody][3] + 
langextra[ibody][3]; + torque[ibody][1] = all[ibody][4] + langextra[ibody][4]; + torque[ibody][2] = all[ibody][5] + langextra[ibody][5]; // update vcm by 1/2 step @@ -1360,7 +1464,7 @@ int FixRigid::dof(int igroup) if (nall[ibody]+mall[ibody] > 0 && nall[ibody]+mall[ibody] != nrigid[ibody]) flag = 1; } - if (flag && comm->me == 0) + if (flag && me == 0) error->warning("Computing temperature of portions of rigid bodies"); // remove appropriate DOFs for each rigid body wholly in temperature group @@ -1834,6 +1938,42 @@ void FixRigid::reset_dt() dtq = 0.5 * update->dt; } +/* ---------------------------------------------------------------------- + return temperature of collection of rigid bodies + non-active DOF are removed by fflag/tflag and in tfactor +------------------------------------------------------------------------- */ + +double FixRigid::compute_scalar() +{ + double wbody[3],rot[3][3]; + + double t = 0.0; + + for (int i = 0; i < nbody; i++) { + t += masstotal[i] * (fflag[i][0]*vcm[i][0]*vcm[i][0] + + fflag[i][1]*vcm[i][1]*vcm[i][1] + \ + fflag[i][2]*vcm[i][2]*vcm[i][2]); + + // wbody = angular velocity in body frame + + MathExtra::quat_to_mat(quat[i],rot); + MathExtra::transpose_matvec(rot,angmom[i],wbody); + if (inertia[i][0] == 0.0) wbody[0] = 0.0; + else wbody[0] /= inertia[i][0]; + if (inertia[i][1] == 0.0) wbody[1] = 0.0; + else wbody[1] /= inertia[i][1]; + if (inertia[i][2] == 0.0) wbody[2] = 0.0; + else wbody[2] /= inertia[i][2]; + + t += tflag[i][0]*inertia[i][0]*wbody[0]*wbody[0] + + tflag[i][1]*inertia[i][1]*wbody[1]*wbody[1] + + tflag[i][2]*inertia[i][2]*wbody[2]*wbody[2]; + } + + t *= tfactor; + return t; +} + /* ---------------------------------------------------------------------- return attributes of a rigid body 15 values per body diff --git a/src/fix_rigid.h b/src/fix_rigid.h index 3aa343015a..06121ad47a 100644 --- a/src/fix_rigid.h +++ b/src/fix_rigid.h @@ -32,9 +32,11 @@ class FixRigid : public Fix { virtual void init(); virtual void 
setup(int); virtual void initial_integrate(int); + void post_force(int); virtual void final_integrate(); void initial_integrate_respa(int, int, int); void final_integrate_respa(int, int); + virtual double compute_scalar(); double memory_usage(); void grow_arrays(int); @@ -50,6 +52,7 @@ class FixRigid : public Fix { double compute_array(int, int); protected: + int me,nprocs; double dtv,dtf,dtq; double *step_respa; int triclinic; @@ -70,6 +73,7 @@ class FixRigid : public Fix { int *imagebody; // image flags of xcm of each rigid body double **fflag; // flag for on/off of center-of-mass force double **tflag; // flag for on/off of center-of-mass torque + double **langextra; // Langevin thermostat forces and torques int *body; // which body each atom is part of (-1 if none) double **displace; // displacement of each atom in body coords @@ -85,6 +89,9 @@ class FixRigid : public Fix { double **qorient; // rotation state of ext particle wrt rigid body double **dorient; // orientation of dipole mu wrt rigid body + double tfactor; // scale factor on temperature of rigid bodies + int langflag; // 0/1 = no/yes Langevin thermostat + int tempflag; // NVT settings double t_start,t_stop; double t_period,t_freq; @@ -95,6 +102,7 @@ class FixRigid : public Fix { double p_period,p_freq; int p_chain; + class RanMars *random; class AtomVecEllipsoid *avec_ellipsoid; // bitmasks for eflags From 72644dcb8d26290fcc21da6992876b810be9b737 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 16:27:56 +0000 Subject: [PATCH 05/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6034 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/compute_temp_asphere.html | 52 ++++++++++++++++------ doc/compute_temp_asphere.txt | 47 ++++++++++++++------ doc/compute_temp_sphere.html | 56 ++++++++++++++++------- doc/compute_temp_sphere.txt | 51 ++++++++++++++------- doc/fix_langevin.html | 38 +++++++++++++--- doc/fix_langevin.txt | 37 ++++++++++++--- doc/fix_rigid.html | 84 
+++++++++++++++++++++++++---------- doc/fix_rigid.txt | 84 +++++++++++++++++++++++++---------- 8 files changed, 333 insertions(+), 116 deletions(-) diff --git a/doc/compute_temp_asphere.html b/doc/compute_temp_asphere.html index 3b29b68e74..daaad528a9 100644 --- a/doc/compute_temp_asphere.html +++ b/doc/compute_temp_asphere.html @@ -13,16 +13,29 @@

Syntax:

-
compute ID group-ID temp/asphere bias-ID 
+
compute ID group-ID temp/asphere keyword value ... 
 
-
  • ID, group-ID are documented in compute command -
  • temp/asphere = style name of this compute command -
  • bias-ID = ID of a temperature compute that removes a velocity bias (optional) +
    • ID, group-ID are documented in compute command + +
    • temp/asphere = style name of this compute command + +
    • zero or more keyword/value pairs may be appended + +
    • keyword = bias or dof + +
        bias value = bias-ID
      +    bias-ID = ID of a temperature compute that removes a velocity bias
      +  dof value = all or rotate
      +    all = compute temperature of translational and rotational degrees of freedom
      +    rotate = compute temperature of just rotational degrees of freedom 
      +
      +

    Examples:

    compute 1 all temp/asphere
    -compute myTemp mobile temp/asphere tempCOM 
    +compute myTemp mobile temp/asphere bias tempCOM
    +compute myTemp mobile temp/asphere dof rotate 
     

    Description:

    @@ -75,15 +88,6 @@ vector are ordered xx, yy, zz, xy, xz, yz. constant for the duration of the run; use the dynamic option of the compute_modify command if this is not the case.

    -

    If a bias-ID is specified it must be the ID of a temperature compute -that removes a "bias" velocity from each atom. This allows compute -temp/sphere to compute its thermal temperature after the translational -kinetic energy components have been altered in a prescribed way, -e.g. to remove a velocity profile. Thermostats that use this compute -will work with this bias term. See the doc pages for individual -computes that calculate a temperature and the doc pages for fixes that -perform thermostatting for more details. -

    This compute subtracts out translational degrees-of-freedom due to fixes that constrain molecular motion, such as fix shake and fix rigid. This means the @@ -96,6 +100,26 @@ be altered using the extra option of the discussion of different ways to compute temperature and perform thermostatting.

    +
    + +

    The keyword/value option pairs are used in the following ways. +

    +

      For the bias keyword, bias-ID refers to the ID of a temperature +compute that removes a "bias" velocity from each atom. This allows +compute temp/asphere to compute its thermal temperature after the +translational kinetic energy components have been altered in a +prescribed way, e.g. to remove a velocity profile. Thermostats that +use this compute will work with this bias term. See the doc pages for +individual computes that calculate a temperature and the doc pages for +fixes that perform thermostatting for more details. +

    +

    For the dof keyword, a setting of all calculates a temperature +that includes both translational and rotational degrees of freedom. A +setting of rotate calculates a temperature that includes only +rotational degrees of freedom. +

    +
    +

    Output info:

    This compute calculates a global scalar (the temperature) and a global diff --git a/doc/compute_temp_asphere.txt b/doc/compute_temp_asphere.txt index b22256fbe8..cdd8870981 100755 --- a/doc/compute_temp_asphere.txt +++ b/doc/compute_temp_asphere.txt @@ -10,16 +10,24 @@ compute temp/asphere command :h3 [Syntax:] -compute ID group-ID temp/asphere bias-ID :pre +compute ID group-ID temp/asphere keyword value ... :pre -ID, group-ID are documented in "compute"_compute.html command -temp/asphere = style name of this compute command -bias-ID = ID of a temperature compute that removes a velocity bias (optional) :ul +ID, group-ID are documented in "compute"_compute.html command :ulb,l +temp/asphere = style name of this compute command :l +zero or more keyword/value pairs may be appended :l +keyword = {bias} or {dof} :l + {bias} value = bias-ID{uniform} or {gaussian} + bias-ID = ID of a temperature compute that removes a velocity bias + {dof} value = {all} or {rotate} + all = compute temperature of translational and rotational degrees of freedom + rotate = compute temperature of just rotational degrees of freedom :pre +:ule [Examples:] compute 1 all temp/asphere -compute myTemp mobile temp/asphere tempCOM :pre +compute myTemp mobile temp/asphere bias tempCOM +compute myTemp mobile temp/asphere dof rotate :pre [Description:] @@ -72,15 +80,6 @@ The number of atoms contributing to the temperature is assumed to be constant for the duration of the run; use the {dynamic} option of the "compute_modify"_compute_modify.html command if this is not the case. -If a {bias-ID} is specified it must be the ID of a temperature compute -that removes a "bias" velocity from each atom. This allows compute -temp/sphere to compute its thermal temperature after the translational -kinetic energy components have been altered in a prescribed way, -e.g. to remove a velocity profile. Thermostats that use this compute -will work with this bias term. 
See the doc pages for individual -computes that calculate a temperature and the doc pages for fixes that -perform thermostatting for more details. - This compute subtracts out translational degrees-of-freedom due to fixes that constrain molecular motion, such as "fix shake"_fix_shake.html and "fix rigid"_fix_rigid.html. This means the @@ -93,6 +92,26 @@ See "this howto section"_Section_howto.html#4_16 of the manual for a discussion of different ways to compute temperature and perform thermostatting. +:line + +The keyword/value option pairs are used in the following ways. + +For the {bias} keyword, {bias-ID} refers to the ID of a temperature +compute that removes a "bias" velocity from each atom. This allows +compute temp/sphere to compute its thermal temperature after the +translational kinetic energy components have been altered in a +prescribed way, e.g. to remove a velocity profile. Thermostats that +use this compute will work with this bias term. See the doc pages for +individual computes that calculate a temperature and the doc pages for +fixes that perform thermostatting for more details. + +For the {dof} keyword, a setting of {all} calculates a temperature +that includes both translational and rotational degrees of freedom. A +setting of {rotate} calculates a temperature that includes only +rotational degrees of freedom. + +:line + [Output info:] This compute calculates a global scalar (the temperature) and a global diff --git a/doc/compute_temp_sphere.html b/doc/compute_temp_sphere.html index 31e73a05f5..23e18d16b5 100644 --- a/doc/compute_temp_sphere.html +++ b/doc/compute_temp_sphere.html @@ -13,16 +13,29 @@

    Syntax:

    -
    compute ID group-ID temp/sphere bias-ID 
    +
    compute ID group-ID temp/sphere keyword value ... 
     
    -
    • ID, group-ID are documented in compute command -
    • temp/sphere = style name of this compute command -
    • bias-ID = ID of a temperature compute that removes a velocity bias (optional) +
      • ID, group-ID are documented in compute command + +
      • temp/sphere = style name of this compute command + +
      • zero or more keyword/value pairs may be appended + +
      • keyword = bias or dof + +
          bias value = bias-ID
        +    bias-ID = ID of a temperature compute that removes a velocity bias
        +  dof value = all or rotate
        +    all = compute temperature of translational and rotational degrees of freedom
        +    rotate = compute temperature of just rotational degrees of freedom 
        +
        +

      Examples:

      compute 1 all temp/sphere
      -compute myTemp mobile temp/sphere tempCOM 
      +compute myTemp mobile temp/sphere bias tempCOM
      +compute myTemp mobile temp/sphere dof rotate 
       

      Description:

      @@ -66,15 +79,6 @@ the vector are ordered xx, yy, zz, xy, xz, yz. constant for the duration of the run; use the dynamic option of the compute_modify command if this is not the case.

      -

      If a bias-ID is specified it must be the ID of a temperature compute -that removes a "bias" velocity from each atom. This allows compute -temp/sphere to compute its thermal temperature after the translational -kinetic energy components have been altered in a prescribed way, -e.g. to remove a velocity profile. Thermostats that use this compute -will work with this bias term. See the doc pages for individual -computes that calculate a temperature and the doc pages for fixes that -perform thermostatting for more details. -

      This compute subtracts out translational degrees-of-freedom due to fixes that constrain molecular motion, such as fix shake and fix rigid. This means the @@ -87,6 +91,26 @@ be altered using the extra option of the discussion of different ways to compute temperature and perform thermostatting.

      +
      + +

      The keyword/value option pairs are used in the following ways. +

      +

      For the bias keyword, bias-ID refers to the ID of a temperature +compute that removes a "bias" velocity from each atom. This allows +compute temp/sphere to compute its thermal temperature after the +translational kinetic energy components have been altered in a +prescribed way, e.g. to remove a velocity profile. Thermostats that +use this compute will work with this bias term. See the doc pages for +individual computes that calculate a temperature and the doc pages for +fixes that perform thermostatting for more details. +

      +

      For the dof keyword, a setting of all calculates a temperature +that includes both translational and rotational degrees of freedom. A +setting of rotate calculates a temperature that includes only +rotational degrees of freedom. +

      +
      +

      Output info:

      This compute calculates a global scalar (the temperature) and a global @@ -116,6 +140,8 @@ particles with radius = 0.0.

      compute temp, compute temp/asphere

      -

      Default: none +

      Default: +

      +

      The option defaults are no bias and dof = all.

      diff --git a/doc/compute_temp_sphere.txt b/doc/compute_temp_sphere.txt index 874f50f364..16d1fcc761 100755 --- a/doc/compute_temp_sphere.txt +++ b/doc/compute_temp_sphere.txt @@ -10,16 +10,24 @@ compute temp/sphere command :h3 [Syntax:] -compute ID group-ID temp/sphere bias-ID :pre +compute ID group-ID temp/sphere keyword value ... :pre -ID, group-ID are documented in "compute"_compute.html command -temp/sphere = style name of this compute command -bias-ID = ID of a temperature compute that removes a velocity bias (optional) :ul +ID, group-ID are documented in "compute"_compute.html command :ulb,l +temp/sphere = style name of this compute command :l +zero or more keyword/value pairs may be appended :l +keyword = {bias} or {dof} :l + {bias} value = bias-ID{uniform} or {gaussian} + bias-ID = ID of a temperature compute that removes a velocity bias + {dof} value = {all} or {rotate} + all = compute temperature of translational and rotational degrees of freedom + rotate = compute temperature of just rotational degrees of freedom :pre +:ule [Examples:] compute 1 all temp/sphere -compute myTemp mobile temp/sphere tempCOM :pre +compute myTemp mobile temp/sphere bias tempCOM +compute myTemp mobile temp/sphere dof rotate :pre [Description:] @@ -63,15 +71,6 @@ The number of atoms contributing to the temperature is assumed to be constant for the duration of the run; use the {dynamic} option of the "compute_modify"_compute_modify.html command if this is not the case. -If a {bias-ID} is specified it must be the ID of a temperature compute -that removes a "bias" velocity from each atom. This allows compute -temp/sphere to compute its thermal temperature after the translational -kinetic energy components have been altered in a prescribed way, -e.g. to remove a velocity profile. Thermostats that use this compute -will work with this bias term. 
See the doc pages for individual -computes that calculate a temperature and the doc pages for fixes that -perform thermostatting for more details. - This compute subtracts out translational degrees-of-freedom due to fixes that constrain molecular motion, such as "fix shake"_fix_shake.html and "fix rigid"_fix_rigid.html. This means the @@ -84,6 +83,26 @@ See "this howto section"_Section_howto.html#4_16 of the manual for a discussion of different ways to compute temperature and perform thermostatting. +:line + +The keyword/value option pairs are used in the following ways. + +For the {bias} keyword, {bias-ID} refers to the ID of a temperature +compute that removes a "bias" velocity from each atom. This allows +compute temp/sphere to compute its thermal temperature after the +translational kinetic energy components have been altered in a +prescribed way, e.g. to remove a velocity profile. Thermostats that +use this compute will work with this bias term. See the doc pages for +individual computes that calculate a temperature and the doc pages for +fixes that perform thermostatting for more details. + +For the {dof} keyword, a setting of {all} calculates a temperature +that includes both translational and rotational degrees of freedom. A +setting of {rotate} calculates a temperature that includes only +rotational degrees of freedom. + +:line + [Output info:] This compute calculates a global scalar (the temperature) and a global @@ -113,4 +132,6 @@ particles with radius = 0.0. "compute temp"_compute_temp.html, "compute temp/asphere"_compute_temp.html -[Default:] none +[Default:] + +The option defaults are no bias and dof = all. diff --git a/doc/fix_langevin.html b/doc/fix_langevin.html index 07c16421e0..b304ac44fb 100644 --- a/doc/fix_langevin.html +++ b/doc/fix_langevin.html @@ -27,14 +27,21 @@
    • zero or more keyword/value pairs may be appended -
      keyword = scale or tally
      +
    • keyword = angmom or omega or scale or tally or zero + +
        angmom value = no or yes
      +    no = do not thermostat rotational degrees of freedom via the angular momentum
      +    yes = do thermostat rotational degrees of freedom via the angular momentum
      +  omega value = no or yes
+    no = do not thermostat rotational degrees of freedom via the angular velocity
      +    yes = do thermostat rotational degrees of freedom via the angular velocity
         scale values = type ratio
           type = atom type (1-N)
           ratio = factor by which to scale the damping coefficient
      -  tally values = no or yes
      +  tally value = no or yes
           no = do not tally the energy added/subtracted to atoms
           yes = do tally the energy added/subtracted to atoms
      -  zero values = no or yes
      +  zero value = no or yes
           no = do not set total random force to zero
           yes = set total random force to zero 
       
      @@ -135,6 +142,25 @@ generate its own unique seed and its own stream of random numbers. Thus the dynamics of the system will not be identical on two runs on different numbers of processors.

      +
      + +

      The keyword/value option pairs are used in the following ways. +

      +

      The angmom and omega keywords enable thermostatting of +rotational degrees of freedom in addition to the usual translational +degrees of freedom. This can only be done for finite-size particles. +A simulation using atom_style sphere defines an omega for finite-size +spheres. A simulation using atom_style ellipsoid defines a finite +size and shape for aspherical particles and an angular momentum. The +Langevin formulas for thermostatting the rotational degrees of freedom +are the same as those above, where force is replaced by torque, m is +replaced by the moment of inertia I, and v is replaced by omega (which +is derived from the angular momentum in the case of aspherical +particles). The rotational temperature of the particles can be +monitored by the compute temp/sphere and +compute temp/asphere commands with their +rotate options. +

      The keyword scale allows the damp factor to be scaled up or down by the specified factor for atoms of that type. This can be useful when different atom types have different sizes or masses. It can be used @@ -166,6 +192,8 @@ to zero by subtracting off an equal part of it from each atom in the group. As a result, the center-of-mass of a system with zero initial momentum will not drift over time.

      +
      +

      Restart, fix_modify, output, run start/stop, minimize info:

      No information about this fix is written to binary restart @@ -209,8 +237,8 @@ dpd/tstat

      Default:

      -

      The option defaults are scale = 1.0 for all types, tally = no, zero = -no. +

      The option defaults are angmom = no, omega = no, scale = 1.0 for all +types, tally = no, zero = no.


      diff --git a/doc/fix_langevin.txt b/doc/fix_langevin.txt index 422889c9b1..1228b63265 100644 --- a/doc/fix_langevin.txt +++ b/doc/fix_langevin.txt @@ -18,14 +18,20 @@ Tstart,Tstop = desired temperature at start/end of run (temperature units) :l damp = damping parameter (time units) :l seed = random number seed to use for white noise (positive integer) :l zero or more keyword/value pairs may be appended :l -keyword = {scale} or {tally} +keyword = {angmom} or {omega} or {scale} or {tally} or {zero} :l + {angmom} value = {no} or {yes} + {no} = do not thermostat rotational degrees of freedom via the angular momentum + {yes} = do thermostat rotational degrees of freedom via the angular momentum + {omega} value = {no} or {yes} + {no} = do not thermostat rotational degrees of freedom via then angular velocity + {yes} = do thermostat rotational degrees of freedom via the angular velocity {scale} values = type ratio type = atom type (1-N) ratio = factor by which to scale the damping coefficient - {tally} values = {no} or {yes} + {tally} value = {no} or {yes} {no} = do not tally the energy added/subtracted to atoms {yes} = do tally the energy added/subtracted to atoms - {zero} values = {no} or {yes} + {zero} value = {no} or {yes} {no} = do not set total random force to zero {yes} = set total random force to zero :pre :ule @@ -125,6 +131,25 @@ generate its own unique seed and its own stream of random numbers. Thus the dynamics of the system will not be identical on two runs on different numbers of processors. +:line + +The keyword/value option pairs are used in the following ways. + +The keyword {angmom} and {omega} keywords enable thermostatting of +rotational degrees of freedom in addition to the usual translational +degrees of freedom. This can only be done for finite-size particles. +A simulation using atom_style sphere defines an omega for finite-size +spheres. 
A simulation using atom_style ellipsoid defines a finite +size and shape for aspherical particles and an angular momentum. The +Langevin formulas for thermostatting the rotational degrees of freedom +are the same as those above, where force is replaced by torque, m is +replaced by the moment of inertia I, and v is replaced by omega (which +is derived from the angular momentum in the case of aspherical +particles). The rotational temperature of the particles can be +monitored by the "compute temp/sphere"_compute_temp_sphere.html and +"compute temp/asphere"_compute_temp_asphere.html commands with their +rotate options. + The keyword {scale} allows the damp factor to be scaled up or down by the specified factor for atoms of that type. This can be useful when different atom types have different sizes or masses. It can be used @@ -156,6 +181,8 @@ to zero by subtracting off an equal part of it from each atom in the group. As a result, the center-of-mass of a system with zero initial momentum will not drift over time. +:line + [Restart, fix_modify, output, run start/stop, minimize info:] No information about this fix is written to "binary restart @@ -199,8 +226,8 @@ dpd/tstat"_pair_dpd.html [Default:] -The option defaults are scale = 1.0 for all types, tally = no, zero = -no. +The option defaults are angmom = no, omega = no, scale = 1.0 for all +types, tally = no, zero = no. :line diff --git a/doc/fix_rigid.html b/doc/fix_rigid.html index b7b057c857..9abf8effc8 100644 --- a/doc/fix_rigid.html +++ b/doc/fix_rigid.html @@ -33,9 +33,13 @@
    • zero or more keyword/value pairs may be appended -
    • keyword = temp or press or tparam or pparam or force or torque +
    • keyword = langevin or temp or tparam or force or torque -
        temp values = Tstart Tstop Tperiod
      +
        langevin values = Tstart Tstop Tdamp seed
      +    Tstart,Tstop = desired temperature at start/stop of run (temperature units)
      +    Tdamp = temperature damping parameter (time units)
      +    seed = random number seed to use for white noise (positive integer)
      +  temp values = Tstart Tstop Tdamp
           Tstart,Tstop = desired temperature at start/stop of run (temperature units)
           Tdamp = temperature damping parameter (time units)
         tparam values = Tchain Titer Torder
      @@ -54,7 +58,7 @@
       

      Examples:

      fix 1 clump rigid single
      -fix 1 clump rigid single force 1 off off on
      +fix 1 clump rigid single force 1 off off on langevin 1.0 1.0 1.0 428984
       fix 1 polychains rigid/nvt molecule temp 1.0 1.0 5.0
       fix 1 polychains rigid molecule force 1*5 off off off force 6*10 off off on
       fix 2 fluid rigid group 3 clump1 clump2 clump3 torque * off off off 
      @@ -200,19 +204,35 @@ multiple rigid fixes to be defined, but it is more expensive.
       


      -

      As stated above, the rigid and rigid/nve styles perform constant -NVE time integration. Thus the temp, press, and tparam keywords -cannot be used with these styles. +

      The keyword/value option pairs are used in the following ways.

      -

      The rigid/nvt style performs constant NVT time integration, using a -temperature it computes for the rigid bodies which includes their -translational and rotational motion. The temp keyword must be used -with this style. The desired temperature at each timestep is a ramped -value during the run from Tstart to Tstop. The Tdamp parameter -is specified in time units and determines how rapidly the temperature -is relaxed. For example, a value of 100.0 means to relax the -temperature in a timespan of (roughly) 100 time units (tau or fmsec or -psec - see the units command). +

      The langevin and temp and tparam keywords perform thermostatting +of the rigid bodies, altering both their translational and rotational +degrees of freedom. What is meant by "temperature" of a collection of +rigid bodies and how it can be monitored via the fix output is +discussed below. +

      +

      The langevin keyword applies a Langevin thermostat to the constant +NVE time integration performed by either the rigid or rigid/nve +styles. It cannot be used with the rigid/nvt style. The desired +temperature at each timestep is a ramped value during the run from +Tstart to Tstop. The Tdamp parameter is specified in time units +and determines how rapidly the temperature is relaxed. For example, a +value of 100.0 means to relax the temperature in a timespan of +(roughly) 100 time units (tau or fmsec or psec - see the +units command). The random # seed must be a positive +integer. The way the Langevin thermostatting operates is explained on +the fix langevin doc page. +

      +

      The temp and tparam keywords apply a Nose/Hoover thermostat to the +NVT time integration performed by the rigid/nvt style. They cannot +be used with the rigid or rigid/nve styles. The desired +temperature at each timestep is a ramped value during the run from +Tstart to Tstop. The Tdamp parameter is specified in time units +and determines how rapidly the temperature is relaxed. For example, a +value of 100.0 means to relax the temperature in a timespan of +(roughly) 100 time units (tau or fmsec or psec - see the +units command).

      Nose/Hoover chains are used in conjunction with this thermostat. The tparam keyword can optionally be used to change the chain settings @@ -222,18 +242,22 @@ oscillations in temperature that can occur in a simulation. As a rule of thumb, increasing the chain length should lead to smaller oscillations.

      -

      There are alternate ways to thermostat a system of rigid bodies. You -can use fix langevin to treat the system as -effectively immersed in an implicit solvent, e.g. a Brownian dynamics -model. For hybrid systems with both rigid bodies and solvent -particles, you can thermostat only the solvent particles that surround -one or more rigid bodies by appropriate choice of groups in the -compute and fix commands for temperature and thermostatting. The -solvent interactions with the rigid bodies should then effectively -thermostat the rigid body temperature as well. +

      IMPORTANT NOTE: There are alternate ways to thermostat a system of +rigid bodies. You can use fix langevin to treat +the individual particles in the rigid bodies as effectively immersed +in an implicit solvent, e.g. a Brownian dynamics model. For hybrid +systems with both rigid bodies and solvent particles, you can +thermostat only the solvent particles that surround one or more rigid +bodies by appropriate choice of groups in the compute and fix commands +for temperature and thermostatting. The solvent interactions with the +rigid bodies should then effectively thermostat the rigid body +temperature as well without use of the Langevin or Nose/Hoover options +associated with the fix rigid commands.


      +

      The keyword/value option pairs are used in the following ways. +

      If you use a temperature compute with a group that includes particles in rigid bodies, the degrees-of-freedom removed by each rigid body are accounted for in the temperature (and pressure) @@ -289,6 +313,18 @@ rigid/nvt fix to add the energy change induced by the thermostatting to the system's potential energy as part of thermodynamic output.

      +

      The rigid and rigid/nve fixes compute a global scalar which can be +accessed by various output commands. The +scalar value calculated by these fixes is "intensive". The scalar is +the current temperature of the collection of rigid bodies. This is +averaged over all rigid bodies and their translational and rotational +degrees of freedom. The translational energy of a rigid body is 1/2 m +v^2, where m = total mass of the body and v = the velocity of its +center of mass. The rotational energy of a rigid body is 1/2 I w^2, +where I = the moment of inertia tensor of the body and w = its angular +velocity. Degrees of freedom constrained by the force and torque +keywords are removed from this calculation. +

      The rigid/nvt fix computes a global scalar which can be accessed by various output commands. The scalar value calculated by the rigid/nvt fix is "extensive". The scalar is the diff --git a/doc/fix_rigid.txt b/doc/fix_rigid.txt index 1a1c7ee6d4..9130c881ed 100644 --- a/doc/fix_rigid.txt +++ b/doc/fix_rigid.txt @@ -24,8 +24,12 @@ bodystyle = {single} or {molecule} or {group} :l groupID1, groupID2, ... = list of N group IDs :pre zero or more keyword/value pairs may be appended :l -keyword = {temp} or {press} or {tparam} or {pparam} or {force} or {torque} :l - {temp} values = Tstart Tstop Tperiod +keyword = {langevin} or {temp} or {tparam} or {force} or {torque} :l + {langevin} values = Tstart Tstop Tperiod seed + Tstart,Tstop = desired temperature at start/stop of run (temperature units) + Tdamp = temperature damping parameter (time units) + seed = random number seed to use for white noise (positive integer) + {temp} values = Tstart Tstop Tdamp Tstart,Tstop = desired temperature at start/stop of run (temperature units) Tdamp = temperature damping parameter (time units) {tparam} values = Tchain Titer Torder @@ -43,7 +47,7 @@ keyword = {temp} or {press} or {tparam} or {pparam} or {force} or {torque} :l [Examples:] fix 1 clump rigid single -fix 1 clump rigid single force 1 off off on +fix 1 clump rigid single force 1 off off on langevin 1.0 1.0 1.0 428984 fix 1 polychains rigid/nvt molecule temp 1.0 1.0 5.0 fix 1 polychains rigid molecule force 1*5 off off off force 6*10 off off on fix 2 fluid rigid group 3 clump1 clump2 clump3 torque * off off off :pre @@ -189,19 +193,35 @@ multiple rigid fixes to be defined, but it is more expensive. :line -As stated above, the {rigid} and {rigid/nve} styles perform constant -NVE time integration. Thus the {temp}, {press}, and {tparam} keywords -cannot be used with these styles. +The keyword/value option pairs are used in the following ways. 
-The {rigid/nvt} style performs constant NVT time integration, using a -temperature it computes for the rigid bodies which includes their -translational and rotational motion. The {temp} keyword must be used -with this style. The desired temperature at each timestep is a ramped -value during the run from {Tstart} to {Tstop}. The {Tdamp} parameter -is specified in time units and determines how rapidly the temperature -is relaxed. For example, a value of 100.0 means to relax the -temperature in a timespan of (roughly) 100 time units (tau or fmsec or -psec - see the "units"_units.html command). +The {langevin} and {temp} and {tparam} keywords perform thermostatting +of the rigid bodies, altering both their translational and rotational +degrees of freedom. What is meant by "temperature" of a collection of +rigid bodies and how it can be monitored via the fix output is +discussed below. + +The {langevin} keyword applies a Langevin thermostat to the constant +NVE time integration performed by either the {rigid} or {rigid/nve} +styles. It cannot be used with the {rigid/nvt} style. The desired +temperature at each timestep is a ramped value during the run from +{Tstart} to {Tstop}. The {Tdamp} parameter is specified in time units +and determines how rapidly the temperature is relaxed. For example, a +value of 100.0 means to relax the temperature in a timespan of +(roughly) 100 time units (tau or fmsec or psec - see the +"units"_units.html command). The random # {seed} must be a positive +integer. The way the Langevin thermostatting operates is explained on +the "fix langevin"_fix_langevin.html doc page. + +The {temp} and {tparam} keywords apply a Nose/Hoover thermostat to the +NVT time integration performed by the {rigid/nvt} style. They cannot +be used with the {rigid} or {rigid/nve} styles. The desired +temperature at each timestep is a ramped value during the run from +{Tstart} to {Tstop}. 
The {Tdamp} parameter is specified in time units +and determines how rapidly the temperature is relaxed. For example, a +value of 100.0 means to relax the temperature in a timespan of +(roughly) 100 time units (tau or fmsec or psec - see the +"units"_units.html command). Nose/Hoover chains are used in conjunction with this thermostat. The {tparam} keyword can optionally be used to change the chain settings @@ -211,18 +231,22 @@ oscillations in temperature that can occur in a simulation. As a rule of thumb, increasing the chain length should lead to smaller oscillations. -There are alternate ways to thermostat a system of rigid bodies. You -can use "fix langevin"_fix_langevin.html to treat the system as -effectively immersed in an implicit solvent, e.g. a Brownian dynamics -model. For hybrid systems with both rigid bodies and solvent -particles, you can thermostat only the solvent particles that surround -one or more rigid bodies by appropriate choice of groups in the -compute and fix commands for temperature and thermostatting. The -solvent interactions with the rigid bodies should then effectively -thermostat the rigid body temperature as well. +IMPORTANT NOTE: There are alternate ways to thermostat a system of +rigid bodies. You can use "fix langevin"_fix_langevin.html to treat +the individual particles in the rigid bodies as effectively immersed +in an implicit solvent, e.g. a Brownian dynamics model. For hybrid +systems with both rigid bodies and solvent particles, you can +thermostat only the solvent particles that surround one or more rigid +bodies by appropriate choice of groups in the compute and fix commands +for temperature and thermostatting. The solvent interactions with the +rigid bodies should then effectively thermostat the rigid body +temperature as well without use of the Langevin or Nose/Hoover options +associated with the fix rigid commands. :line +The keyword/value option pairs are used in the following ways. 
+ If you use a "temperature compute"_compute.html with a group that includes particles in rigid bodies, the degrees-of-freedom removed by each rigid body are accounted for in the temperature (and pressure) @@ -278,6 +302,18 @@ rigid/nvt fix to add the energy change induced by the thermostatting to the system's potential energy as part of "thermodynamic output"_thermo_style.html. +The rigid and rigid/nve fixes computes a global scalar which can be +accessed by various "output commands"_Section_howto.html#4_15. The +scalar value calculated by these fixes is "intensive". The scalar is +the current temperature of the collection of rigid bodies. This is +averaged over all rigid bodies and their translational and rotational +degrees of freedom. The translational energy of a rigid body is 1/2 m +v^2, where m = total mass of the body and v = the velocity of its +center of mass. The rotational energy of a rigid body is 1/2 I w^2, +where I = the moment of inertia tensor of the body and w = its angular +velocity. Degrees of freedom constrained by the {force} and {torque} +keywords are removed from this calculation. + The rigid/nvt fix computes a global scalar which can be accessed by various "output commands"_Section_howto.html#4_15. The scalar value calculated by the rigid/nvt fix is "extensive". 
The scalar is the From a990ae69be42fc5acb30c93cdcb3c391f19ae5fc Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 16:28:42 +0000 Subject: [PATCH 06/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6035 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/fix_langevin.cpp | 26 +++++++++++++------------- src/fix_rigid_nve.cpp | 18 +++++++++++------- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/src/fix_langevin.cpp b/src/fix_langevin.cpp index 0d233126ef..2f4308e386 100644 --- a/src/fix_langevin.cpp +++ b/src/fix_langevin.cpp @@ -82,7 +82,19 @@ FixLangevin::FixLangevin(LAMMPS *lmp, int narg, char **arg) : int iarg = 7; while (iarg < narg) { - if (strcmp(arg[iarg],"scale") == 0) { + if (strcmp(arg[iarg],"angmom") == 0) { + if (iarg+2 > narg) error->all("Illegal fix langevin command"); + if (strcmp(arg[iarg+1],"no") == 0) aflag = 0; + else if (strcmp(arg[iarg+1],"yes") == 0) aflag = 1; + else error->all("Illegal fix langevin command"); + iarg += 2; + } else if (strcmp(arg[iarg],"omega") == 0) { + if (iarg+2 > narg) error->all("Illegal fix langevin command"); + if (strcmp(arg[iarg+1],"no") == 0) oflag = 0; + else if (strcmp(arg[iarg+1],"yes") == 0) oflag = 1; + else error->all("Illegal fix langevin command"); + iarg += 2; + } else if (strcmp(arg[iarg],"scale") == 0) { if (iarg+3 > narg) error->all("Illegal fix langevin command"); int itype = atoi(arg[iarg+1]); double scale = atof(arg[iarg+2]); @@ -102,18 +114,6 @@ FixLangevin::FixLangevin(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+1],"yes") == 0) zeroflag = 1; else error->all("Illegal fix langevin command"); iarg += 2; - } else if (strcmp(arg[iarg],"omega") == 0) { - if (iarg+2 > narg) error->all("Illegal fix langevin command"); - if (strcmp(arg[iarg+1],"no") == 0) oflag = 0; - else if (strcmp(arg[iarg+1],"yes") == 0) oflag = 1; - else error->all("Illegal fix langevin command"); - iarg += 2; - } else if (strcmp(arg[iarg],"angmom") == 0) { - if (iarg+2 > narg) 
error->all("Illegal fix langevin command"); - if (strcmp(arg[iarg+1],"no") == 0) aflag = 0; - else if (strcmp(arg[iarg+1],"yes") == 0) aflag = 1; - else error->all("Illegal fix langevin command"); - iarg += 2; } else error->all("Illegal fix langevin command"); } diff --git a/src/fix_rigid_nve.cpp b/src/fix_rigid_nve.cpp index ccd908e8f8..abdb258a75 100644 --- a/src/fix_rigid_nve.cpp +++ b/src/fix_rigid_nve.cpp @@ -223,16 +223,20 @@ void FixRigidNVE::final_integrate() MPI_Allreduce(sum[0],all[0],6*nbody,MPI_DOUBLE,MPI_SUM,world); + // update vcm and angmom + // include Langevin thermostat forces + // fflag,tflag = 0 for some dimensions in 2d + double mbody[3],tbody[3],fquat[4]; double dtf2 = dtf * 2.0; - + for (ibody = 0; ibody < nbody; ibody++) { - fcm[ibody][0] = all[ibody][0]; - fcm[ibody][1] = all[ibody][1]; - fcm[ibody][2] = all[ibody][2]; - torque[ibody][0] = all[ibody][3]; - torque[ibody][1] = all[ibody][4]; - torque[ibody][2] = all[ibody][5]; + fcm[ibody][0] = all[ibody][0] + langextra[ibody][0]; + fcm[ibody][1] = all[ibody][1] + langextra[ibody][1]; + fcm[ibody][2] = all[ibody][2] + langextra[ibody][2]; + torque[ibody][0] = all[ibody][3] + langextra[ibody][3]; + torque[ibody][1] = all[ibody][4] + langextra[ibody][4]; + torque[ibody][2] = all[ibody][5] + langextra[ibody][5]; // update vcm by 1/2 step From f702d5c26a92cffeed89a868560d8d19b43b75a2 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 16:32:19 +0000 Subject: [PATCH 07/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6036 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index 66a97ccc65..b7cd4f016f 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "27 Apr 2011" +#define LAMMPS_VERSION "29 Apr 2011" From 3b3c1d118dfe1aaf36d7307158cd2a6390c27b8d Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 19:48:13 +0000 Subject: [PATCH 08/21] 
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6040 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/fix_rigid.cpp | 10 ++-------- src/math_extra.cpp | 32 ++++++++++++++++++++++++++++---- src/math_extra.h | 14 ++++++++++++++ 3 files changed, 44 insertions(+), 12 deletions(-) diff --git a/src/fix_rigid.cpp b/src/fix_rigid.cpp index 6d43b9949e..b24b87b41c 100644 --- a/src/fix_rigid.cpp +++ b/src/fix_rigid.cpp @@ -679,10 +679,7 @@ void FixRigid::init() for (i = 0; i < nlocal; i++) { if (body[i] < 0) continue; ibody = body[i]; - - itype = type[i]; - if (rmass) massone = rmass[i]; - else massone = mass[itype]; + massone = rmass[i]; if (eflags[i] & INERTIA_SPHERE) { sum[ibody][0] += 0.4 * massone * radius[i]*radius[i]; @@ -842,10 +839,7 @@ void FixRigid::init() for (i = 0; i < nlocal; i++) { if (body[i] < 0) continue; ibody = body[i]; - - itype = type[i]; - if (rmass) massone = rmass[i]; - else massone = mass[itype]; + massone = rmass[i]; if (eflags[i] & INERTIA_SPHERE) { sum[ibody][0] += 0.4 * massone * radius[i]*radius[i]; diff --git a/src/math_extra.cpp b/src/math_extra.cpp index 5160262aff..32acc8ed08 100644 --- a/src/math_extra.cpp +++ b/src/math_extra.cpp @@ -177,7 +177,7 @@ void rotate(double matrix[3][3], int i, int j, int k, int l, /* ---------------------------------------------------------------------- Richardson iteration to update quaternion from angular momentum return new normalized quaternion q - also returns + also returns updated omega at 1/2 step ------------------------------------------------------------------------- */ void richardson(double *q, double *m, double *w, double *moments, double dtq) @@ -506,9 +506,9 @@ void inertia_triangle(double *v0, double *v1, double *v2, double v[3][3],sv[3][3],vtsv[3][3]; double vvv[3],v1mv0[3],v2mv0[3],normal[3]; - v[0][0] = v0[0]; v[0][1] = v0[2]; v[0][2] = v0[3]; - v[1][0] = v1[0]; v[1][1] = v1[2]; v[1][2] = v1[3]; - v[2][0] = v2[0]; v[2][1] = v2[2]; v[2][2] = v2[3]; + v[0][0] = v0[0]; v[0][1] = v0[1]; 
v[0][2] = v0[2]; + v[1][0] = v1[0]; v[1][1] = v1[1]; v[1][2] = v1[2]; + v[2][0] = v2[0]; v[2][1] = v2[1]; v[2][2] = v2[2]; times3(s,v,sv); transpose_times3(v,sv,vtsv); @@ -533,6 +533,30 @@ void inertia_triangle(double *v0, double *v1, double *v2, inertia[5] = -inv24*a*vtsv[0][1]; } +/* ---------------------------------------------------------------------- + compute space-frame inertia tensor of a triangle + idiag = previously computed diagonal inertia tensor + quat = orientiation quaternion of triangle + return symmetric inertia tensor as 6-vector in Voigt notation +------------------------------------------------------------------------- */ + +void inertia_triangle(double *idiag, double *quat, double mass, + double *inertia) +{ + double p[3][3],ptrans[3][3],itemp[3][3],tensor[3][3]; + + quat_to_mat(quat,p); + quat_to_mat_trans(quat,ptrans); + diag_times3(idiag,ptrans,itemp); + times3(p,itemp,tensor); + inertia[0] = tensor[0][0]; + inertia[1] = tensor[1][1]; + inertia[2] = tensor[2][2]; + inertia[3] = tensor[1][2]; + inertia[4] = tensor[0][2]; + inertia[5] = tensor[0][1]; +} + /* ---------------------------------------------------------------------- */ } diff --git a/src/math_extra.h b/src/math_extra.h index 44af2e9a8a..1e05c5d728 100755 --- a/src/math_extra.h +++ b/src/math_extra.h @@ -31,6 +31,7 @@ namespace MathExtra { inline void normalize3(const double *v, double *ans); inline void snormalize3(const double, const double *v, double *ans); inline void negate3(double *v); + inline void scale3(double s, double *v); inline void add3(const double *v1, const double *v2, double *ans); inline void sub3(const double *v1, const double *v2, double *ans); inline double len3(const double *v); @@ -119,6 +120,8 @@ namespace MathExtra { double *inertia); void inertia_triangle(double *v0, double *v1, double *v2, double mass, double *inertia); + void inertia_triangle(double *idiag, double *quat, double mass, + double *inertia); } /* 
---------------------------------------------------------------------- @@ -168,6 +171,17 @@ void MathExtra::negate3(double *v) v[2] = -v[2]; } +/* ---------------------------------------------------------------------- + scale vector v by s +------------------------------------------------------------------------- */ + +void MathExtra::scale3(double s, double *v) +{ + v[0] *= s; + v[1] *= s; + v[2] *= s; +} + /* ---------------------------------------------------------------------- ans = v1 + v2 ------------------------------------------------------------------------- */ From ee769613d766262e5bf437164385269bb9e0468d Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 21:05:35 +0000 Subject: [PATCH 09/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6041 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/ASPHERE/compute_temp_asphere.cpp | 2 +- src/ASPHERE/fix_nve_asphere.cpp | 2 +- src/compute_erotate_sphere.cpp | 2 +- src/compute_temp_sphere.cpp | 2 +- src/fix_langevin.cpp | 4 ++-- src/fix_nh_sphere.cpp | 2 +- src/fix_nve_sphere.cpp | 2 +- src/memory.h | 6 +++--- 8 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/ASPHERE/compute_temp_asphere.cpp b/src/ASPHERE/compute_temp_asphere.cpp index f2d34ac72a..b4fb8c79f8 100755 --- a/src/ASPHERE/compute_temp_asphere.cpp +++ b/src/ASPHERE/compute_temp_asphere.cpp @@ -34,7 +34,7 @@ using namespace LAMMPS_NS; enum{ROTATE,ALL}; -#define INERTIA 0.2 // moment of inertia for ellipsoid +#define INERTIA 0.2 // moment of inertia prefactor for ellipsoid /* ---------------------------------------------------------------------- */ diff --git a/src/ASPHERE/fix_nve_asphere.cpp b/src/ASPHERE/fix_nve_asphere.cpp index 9e4155581f..e078d2fb75 100755 --- a/src/ASPHERE/fix_nve_asphere.cpp +++ b/src/ASPHERE/fix_nve_asphere.cpp @@ -29,7 +29,7 @@ using namespace LAMMPS_NS; -#define INERTIA 0.2 // moment of inertia for ellipsoid +#define INERTIA 0.2 // moment of inertia prefactor for ellipsoid /* 
---------------------------------------------------------------------- */ diff --git a/src/compute_erotate_sphere.cpp b/src/compute_erotate_sphere.cpp index b357501a06..1aa5ad8d99 100644 --- a/src/compute_erotate_sphere.cpp +++ b/src/compute_erotate_sphere.cpp @@ -23,7 +23,7 @@ using namespace LAMMPS_NS; -#define INERTIA 0.4 // moment of inertia for sphere +#define INERTIA 0.4 // moment of inertia prefactor for sphere /* ---------------------------------------------------------------------- */ diff --git a/src/compute_temp_sphere.cpp b/src/compute_temp_sphere.cpp index 93c9ec74aa..246d58bae4 100644 --- a/src/compute_temp_sphere.cpp +++ b/src/compute_temp_sphere.cpp @@ -28,7 +28,7 @@ using namespace LAMMPS_NS; enum{ROTATE,ALL}; -#define INERTIA 0.4 // moment of inertia for sphere +#define INERTIA 0.4 // moment of inertia prefactor for sphere /* ---------------------------------------------------------------------- */ diff --git a/src/fix_langevin.cpp b/src/fix_langevin.cpp index 2f4308e386..5e40a7c28b 100644 --- a/src/fix_langevin.cpp +++ b/src/fix_langevin.cpp @@ -40,8 +40,8 @@ using namespace LAMMPS_NS; enum{NOBIAS,BIAS}; -#define SINERTIA 0.4 // moment of inertia for sphere -#define EINERTIA 0.2 // moment of inertia for ellipsoid +#define SINERTIA 0.4 // moment of inertia prefactor for sphere +#define EINERTIA 0.2 // moment of inertia prefactor for ellipsoid /* ---------------------------------------------------------------------- */ diff --git a/src/fix_nh_sphere.cpp b/src/fix_nh_sphere.cpp index d1be4184fd..9f39466552 100644 --- a/src/fix_nh_sphere.cpp +++ b/src/fix_nh_sphere.cpp @@ -24,7 +24,7 @@ using namespace LAMMPS_NS; -#define INERTIA 0.4 // moment of inertia for sphere +#define INERTIA 0.4 // moment of inertia prefactor for sphere /* ---------------------------------------------------------------------- */ diff --git a/src/fix_nve_sphere.cpp b/src/fix_nve_sphere.cpp index 02968e1f13..fc67023f4a 100644 --- a/src/fix_nve_sphere.cpp +++ 
b/src/fix_nve_sphere.cpp @@ -24,7 +24,7 @@ using namespace LAMMPS_NS; -#define INERTIA 0.4 // moment of inertia for sphere +#define INERTIA 0.4 // moment of inertia prefactor for sphere enum{NONE,DIPOLE}; diff --git a/src/memory.h b/src/memory.h index 64901536c2..47abb49443 100644 --- a/src/memory.h +++ b/src/memory.h @@ -45,7 +45,7 @@ class Memory : protected Pointers { bigint nbytes = sizeof(TYPE) * n; array = (TYPE *) smalloc(nbytes,name); return array; - }; + } template TYPE **create(TYPE **&array, int n, const char *name) {fail(name);} @@ -62,7 +62,7 @@ class Memory : protected Pointers { bigint nbytes = sizeof(TYPE) * n; array = (TYPE *) srealloc(array,nbytes,name); return array; - }; + } template TYPE **grow(TYPE **&array, int n, const char *name) {fail(name);} @@ -75,7 +75,7 @@ class Memory : protected Pointers { void destroy(TYPE *array) { sfree(array); - }; + } /* ---------------------------------------------------------------------- create a 1d array with index from nlo to nhi inclusive From 21de80701082fc091af03eefd37883711c690c91 Mon Sep 17 00:00:00 2001 From: athomps Date: Fri, 29 Apr 2011 23:38:34 +0000 Subject: [PATCH 10/21] Added xsu, ysu, zsu to dump custom and dump cfg git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6043 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/Section_start.txt | 29 +++++++++++++++++++++++++++-- doc/dump.html | 26 +++++++++++++++++++++----- doc/dump.txt | 26 +++++++++++++++++++++----- 3 files changed, 69 insertions(+), 12 deletions(-) diff --git a/doc/Section_start.txt b/doc/Section_start.txt index 4b4d96693f..fc45cf8cda 100644 --- a/doc/Section_start.txt +++ b/doc/Section_start.txt @@ -782,7 +782,9 @@ one-letter abbreviation can be used: -in or -i -log or -l -screen or -s --var or -v :ul +-var or -v +-plog or -pl +-pscreen or -ps :ul For example, lmp_ibm might be launched as follows: @@ -846,6 +848,7 @@ logfile is named "file" and each partition also logs information to a file.N. 
For both one-partition and multi-partition mode, if the specified file is "none", then no log files are created. Using a "log"_log.html command in the input script will override this setting. +Option -plog will override the name of the partition log files file.N. -screen file :pre @@ -859,7 +862,8 @@ the partition ID. If the switch is specified in multi-partition mode, the hi-level screen dump is named "file" and each partition also writes screen information to a file.N. For both one-partition and multi-partition mode, if the specified file is "none", then no screen -output is performed. +output is performed. Option -pscreen will override the name of the +partition screen files file.N. -var name value1 value2 ... :pre @@ -878,6 +882,27 @@ defining index and other kinds of variables and "this section"_Section_commands.html#3_2 for more info on using variables in input scripts. +-plog file:pre +Specify the base name for the partition log files, +so partition N writes log information to file.N. If file is +none, then no partition log files are created. +This overrides the +filename specified in the -log command-line option +If this option is not used +the log file for partition N is log.lammps.N or whatever is specified by +the -log command-line option. + +-pscreen file:pre +specify the base name for the +partition screen file, so partition N writes +screen information to file.N. If file is +none, then no partition screen files are created. +This overrides the +filename specified in the -screen command-line option. +If this option is not used +the screen file for partition N is screen.N or whatever is specified by +the -screen command-line option. + :line 2.7 LAMMPS screen output :h4,link(2_7) diff --git a/doc/dump.html b/doc/dump.html index 418f4996b5..f95117f5bc 100644 --- a/doc/dump.html +++ b/doc/dump.html @@ -47,7 +47,8 @@

        custom args = list of atom attributes
           possible attributes = id, mol, type, mass,
      -			  x, y, z, xs, ys, zs, xu, yu, zu, ix, iy, iz,
      +			  x, y, z, xs, ys, zs, xu, yu, zu, 
      +			  xsu, ysu, zsu, ix, iy, iz,
       			  vx, vy, vz, fx, fy, fz,
                                 q, mux, muy, muz, mu,
                                 radius, omegax, omegay, omegaz,
      @@ -62,6 +63,7 @@
             x,y,z = unscaled atom coordinates
             xs,ys,zs = scaled atom coordinates
             xu,yu,zu = unwrapped atom coordinates
      +      xsu,ysu,zsu = scaled unwrapped atom coordinates
             ix,iy,iz = box image that the atom is in
             vx,vy,vz = atom velocities
             fx,fy,fz = forces on atoms
      @@ -228,14 +230,23 @@ extended CFG format files, as used by the
       package.  Since the extended CFG format uses a single snapshot of the
       system per file, a wildcard "*" must be included in the filename, as
       discussed below.  The list of atom attributes for style cfg must
      -begin with "id type xs ys zs", since these quantities are needed to
       +begin with either "id type xs ys zs" or "id type xsu ysu zsu", 
       +since these quantities are needed to
       write the CFG files in the appropriate format (though the "id" and
       "type" fields do not appear explicitly in the file).  Any remaining
       attributes will be stored as "auxiliary properties" in the CFG files.
       Note that you will typically want to use the dump_modify
       element command with CFG-formatted files, to
       associate element names with atom types, so that AtomEye can render
      -atoms appropriately.
      +atoms appropriately. When unwrapped coordinates xsu, ysu, and zsu
      +are requested, the nominal AtomEye periodic cell dimensions are expanded 
       +by a large factor UNWRAPEXPAND = 10.0, which ensures that atoms are 
       +displayed correctly for up to UNWRAPEXPAND/2 periodic boundary crossings 
      +in any direction. 
      +Beyond this, AtomEye will rewrap the unwrapped coordinates. 
      +The expansion causes the atoms to be drawn farther
      +away from the viewer, but it is easy to zoom the atoms closer, and
      +the interatomic distances are unaffected.   
       

      The dcd style writes DCD files, a standard atomic trajectory format used by the CHARMM, NAMD, and XPlor molecular dynamics packages. DCD @@ -391,7 +402,7 @@ of atom velocity and force and atomic charge. y, z attributes write atom coordinates "unscaled", in the appropriate distance units (Angstroms, sigma, etc). Use xs, ys, zs if you want the coordinates "scaled" to the box size, -so that each value is 0.0 to 1.0. If the simluation box is triclinic +so that each value is 0.0 to 1.0. If the simulation box is triclinic (tilted), then all atom coords will still be between 0.0 and 1.0. Use xu, yu, zu if you want the coordinates "unwrapped" by the image flags for each atom. Unwrapped means that if the atom has passed thru @@ -399,7 +410,12 @@ a periodic boundary one or more times, the value is printed for what the coordinate would be if it had not been wrapped back into the periodic box. Note that using xu, yu, zu means that the coordinate values may be far outside the box bounds printed with the -snapshot. The image flags can be printed directly using the ix, +snapshot. Using xsu, ysu, zsu is similar to using xu, yu, zu, +except that the unwrapped coordinates are scaled by the box size. Atoms +that have passed through a periodic boundary will have the corresponding +cooordinate increased or decreased by 1.0. +

      +

      The image flags can be printed directly using the ix, iy, iz attributes. The dump_modify command describes in more detail what is meant by scaled vs unscaled coordinates and the image flags. diff --git a/doc/dump.txt b/doc/dump.txt index 749b1b9109..11d0a9d730 100644 --- a/doc/dump.txt +++ b/doc/dump.txt @@ -37,7 +37,8 @@ args = list of arguments for a particular style :l {custom} args = list of atom attributes possible attributes = id, mol, type, mass, - x, y, z, xs, ys, zs, xu, yu, zu, ix, iy, iz, + x, y, z, xs, ys, zs, xu, yu, zu, + xsu, ysu, zsu, ix, iy, iz, vx, vy, vz, fx, fy, fz, q, mux, muy, muz, mu, radius, omegax, omegay, omegaz, @@ -52,6 +53,7 @@ args = list of arguments for a particular style :l x,y,z = unscaled atom coordinates xs,ys,zs = scaled atom coordinates xu,yu,zu = unwrapped atom coordinates + xsu,ysu,zsu = scaled unwrapped atom coordinates ix,iy,iz = box image that the atom is in vx,vy,vz = atom velocities fx,fy,fz = forces on atoms @@ -217,14 +219,23 @@ extended CFG format files, as used by the package. Since the extended CFG format uses a single snapshot of the system per file, a wildcard "*" must be included in the filename, as discussed below. The list of atom attributes for style {cfg} must -begin with "id type xs ys zs", since these quantities are needed to +begin with either "id type xs ys zs" or "id type xsu ysu zsu" or +since these quantities are needed to write the CFG files in the appropriate format (though the "id" and "type" fields do not appear explicitly in the file). Any remaining attributes will be stored as "auxiliary properties" in the CFG files. Note that you will typically want to use the "dump_modify element"_dump_modify.html command with CFG-formatted files, to associate element names with atom types, so that AtomEye can render -atoms appropriately. +atoms appropriately. 
When unwrapped coordinates {xsu}, {ysu}, and {zsu} +are requested, the nominal AtomEye periodic cell dimensions are expanded +by a large factor UNWRAPEXPAND = 10.0, which ensures atoms that are +displayed correctly for up to UNWRAPEXPAND/2 periodic boundary crossings +in any direction. +Beyond this, AtomEye will rewrap the unwrapped coordinates. +The expansion causes the atoms to be drawn farther +away from the viewer, but it is easy to zoom the atoms closer, and +the interatomic distances are unaffected. The {dcd} style writes DCD files, a standard atomic trajectory format used by the CHARMM, NAMD, and XPlor molecular dynamics packages. DCD @@ -380,7 +391,7 @@ There are several options for outputting atom coordinates. The {x}, {y}, {z} attributes write atom coordinates "unscaled", in the appropriate distance "units"_units.html (Angstroms, sigma, etc). Use {xs}, {ys}, {zs} if you want the coordinates "scaled" to the box size, -so that each value is 0.0 to 1.0. If the simluation box is triclinic +so that each value is 0.0 to 1.0. If the simulation box is triclinic (tilted), then all atom coords will still be between 0.0 and 1.0. Use {xu}, {yu}, {zu} if you want the coordinates "unwrapped" by the image flags for each atom. Unwrapped means that if the atom has passed thru @@ -388,7 +399,12 @@ a periodic boundary one or more times, the value is printed for what the coordinate would be if it had not been wrapped back into the periodic box. Note that using {xu}, {yu}, {zu} means that the coordinate values may be far outside the box bounds printed with the -snapshot. The image flags can be printed directly using the {ix}, +snapshot. Using {xsu}, {ysu}, {zsu} is similar to using {xu}, {yu}, {zu}, +except that the unwrapped coordinates are scaled by the box size. Atoms +that have passed through a periodic boundary will have the corresponding +cooordinate increased or decreased by 1.0. + +The image flags can be printed directly using the {ix}, {iy}, {iz} attributes. 
The "dump_modify"_dump_modify.html command describes in more detail what is meant by scaled vs unscaled coordinates and the image flags. From 7c04f95ce0f73b7dcd80b5d82bd256834205e85c Mon Sep 17 00:00:00 2001 From: athomps Date: Fri, 29 Apr 2011 23:40:14 +0000 Subject: [PATCH 11/21] Added xsu, ysu, zsu to dump custom and dump cfg git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6044 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/dump_cfg.cpp | 59 ++++++++++--- src/dump_cfg.h | 1 + src/dump_custom.cpp | 207 +++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 254 insertions(+), 13 deletions(-) diff --git a/src/dump_cfg.cpp b/src/dump_cfg.cpp index 471b5d3bb0..de7790bb89 100755 --- a/src/dump_cfg.cpp +++ b/src/dump_cfg.cpp @@ -30,6 +30,8 @@ #include "memory.h" #include "error.h" +#define UNWRAPEXPAND 10.0 + using namespace LAMMPS_NS; enum{INT,DOUBLE}; // same as in dump_custom.cpp @@ -41,10 +43,20 @@ DumpCFG::DumpCFG(LAMMPS *lmp, int narg, char **arg) : { if (narg < 10 || strcmp(arg[5],"id") != 0 || strcmp(arg[6],"type") != 0 || - strcmp(arg[7],"xs") != 0 || strcmp(arg[8],"ys") != 0 || - strcmp(arg[9],"zs") != 0) - error->all("Dump cfg arguments must start with 'id type xs ys zs'"); + (strcmp(arg[7],"xs") != 0 && strcmp(arg[7],"xsu") != 0) || + (strcmp(arg[8],"ys") != 0 && strcmp(arg[8],"ysu") != 0) || + (strcmp(arg[9],"zs") != 0 && strcmp(arg[9],"zsu") != 0) + ) + error->all("Dump cfg arguments must start with 'id type xs ys zs' or 'id type xsu ysu zsu'"); + if (strcmp(arg[7],"xs") == 0) + if (strcmp(arg[8],"ysu") == 0 || strcmp(arg[9],"zsu") == 0) + error->all("Dump cfg arguments can not mix xs|ys|zs with xsu|ysu|zsu"); + else unwrapflag = 0; + else if (strcmp(arg[8],"ys") == 0 || strcmp(arg[9],"zs") == 0) + error->all("Dump cfg arguments can not mix xs|ys|zs with xsu|ysu|zsu"); + else unwrapflag = 1; + ntypes = atom->ntypes; typenames = NULL; @@ -189,7 +201,9 @@ void DumpCFG::write_header(bigint n) // special handling for atom style peri // use average 
volume of particles to scale particles to mimic C atoms // scale box dimension to sc lattice for C with sigma = 1.44 Angstroms - + + // Special handling for unwrapped coordinates + double scale; if (atom->peri_flag) { int nlocal = atom->nlocal; @@ -199,9 +213,9 @@ void DumpCFG::write_header(bigint n) MPI_Allreduce(&vone,&vave,1,MPI_DOUBLE,MPI_SUM,world); if (atom->natoms) vave /= atom->natoms; if (vave > 0.0) scale = 1.44 / pow(vave,1.0/3.0); - else scale = 1.0; - } else scale = 1.0; - + } else if (unwrapflag == 1) scale = UNWRAPEXPAND; + else scale = 1.0; + if (me == 0 || multiproc) { char str[64]; sprintf(str,"Number of particles = %s\n",BIGINT_FORMAT); @@ -261,6 +275,8 @@ void DumpCFG::write_data(int n, double *mybuf) // write data lines in rbuf to file after transfer is done + double unwrap_coord; + if (nlines == nchosen) { for (itype = 1; itype <= ntypes; itype++) { for (i = 0; i < nchosen; i++) @@ -271,11 +287,30 @@ void DumpCFG::write_data(int n, double *mybuf) fprintf(fp,"%s\n",typenames[itype]); for (; i < nchosen; i++) { if (rbuf[i][1] == itype) { - for (j = 2; j < size_one; j++) { - if (vtype[j] == INT) - fprintf(fp,vformat[j],static_cast (rbuf[i][j])); - else fprintf(fp,vformat[j],rbuf[i][j]); - } + if (unwrapflag == 0) + for (j = 2; j < size_one; j++) { + if (vtype[j] == INT) + fprintf(fp,vformat[j],static_cast (rbuf[i][j])); + else fprintf(fp,vformat[j],rbuf[i][j]); + } + else + + // Unwrapped scaled coordinates are shifted to + // center of expanded box, to prevent + // rewrapping by AtomEye. Dividing by + // expansion factor restores correct + // interatomic distances. 
+ + for (j = 2; j < 5; j++) { + unwrap_coord = (rbuf[i][j] - 0.5)/UNWRAPEXPAND + 0.5; + fprintf(fp,vformat[j],unwrap_coord); + } + for (j = 5; j < size_one; j++) { + if (vtype[j] == INT) + fprintf(fp,vformat[j],static_cast (rbuf[i][j])); + else fprintf(fp,vformat[j],rbuf[i][j]); + } + fprintf(fp,"\n"); } } diff --git a/src/dump_cfg.h b/src/dump_cfg.h index 0467983f76..3ddfc8fa44 100755 --- a/src/dump_cfg.h +++ b/src/dump_cfg.h @@ -36,6 +36,7 @@ class DumpCFG : public DumpCustom { int nchosen; // # of lines to be written on a writing proc int nlines; // # of lines transferred from buf to rbuf double **rbuf; // buf of data lines for data lines rearrangement + int unwrapflag; // 1 if unwrapped coordinates are requested void init_style(); void write_header(bigint); diff --git a/src/dump_custom.cpp b/src/dump_custom.cpp index 823c5c0dbf..b0dcc982a2 100644 --- a/src/dump_custom.cpp +++ b/src/dump_custom.cpp @@ -35,7 +35,9 @@ using namespace LAMMPS_NS; // same list as in compute_property.cpp, also customize that command enum{ID,MOL,TYPE,MASS, - X,Y,Z,XS,YS,ZS,XSTRI,YSTRI,ZSTRI,XU,YU,ZU,XUTRI,YUTRI,ZUTRI,IX,IY,IZ, + X,Y,Z,XS,YS,ZS,XSTRI,YSTRI,ZSTRI,XU,YU,ZU,XUTRI,YUTRI,ZUTRI, + XSU,YSU,ZSU,XSUTRI,YSUTRI,ZSUTRI, + IX,IY,IZ, VX,VY,VZ,FX,FY,FZ, Q,MUX,MUY,MUZ,MU,RADIUS,OMEGAX,OMEGAY,OMEGAZ,ANGMOMX,ANGMOMY,ANGMOMZ, TQX,TQY,TQZ,SPIN,ERADIUS,ERVEL,ERFORCE, @@ -563,6 +565,70 @@ int DumpCustom::count() ptr = dchoose; nstride = 1; + } else if (thresh_array[ithresh] == XSU) { + double **x = atom->x; + int *image = atom->image; + double boxxlo = domain->boxlo[0]; + double invxprd = 1.0/domain->xprd; + for (i = 0; i < nlocal; i++) + dchoose[i] = (x[i][0] - boxxlo) * invxprd + (image[i] & 1023) - 512; + ptr = dchoose; + nstride = 1; + + } else if (thresh_array[ithresh] == YSU) { + double **x = atom->x; + int *image = atom->image; + double boxylo = domain->boxlo[1]; + double invyprd = 1.0/domain->yprd; + for (i = 0; i < nlocal; i++) + dchoose[i] = (x[i][1] - boxylo) * invyprd + (image[i] 
>> 10 & 1023) - 512; + ptr = dchoose; + nstride = 1; + + } else if (thresh_array[ithresh] == ZSU) { + double **x = atom->x; + int *image = atom->image; + double boxzlo = domain->boxlo[2]; + double invzprd = 1.0/domain->zprd; + for (i = 0; i < nlocal; i++) + dchoose[i] = (x[i][2] - boxzlo) * invzprd + (image[i] >> 20) - 512; + ptr = dchoose; + nstride = 1; + + } else if (thresh_array[ithresh] == XSUTRI) { + double **x = atom->x; + int *image = atom->image; + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + for (i = 0; i < nlocal; i++) + dchoose[i] = h_inv[0]*(x[i][0]-boxlo[0]) + + h_inv[5]*(x[i][1]-boxlo[1]) + + h_inv[4]*(x[i][2]-boxlo[2]) + + (image[i] & 1023) - 512; + ptr = dchoose; + nstride = 1; + } else if (thresh_array[ithresh] == YSUTRI) { + double **x = atom->x; + int *image = atom->image; + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + for (i = 0; i < nlocal; i++) + dchoose[i] = h_inv[1]*(x[i][1]-boxlo[1]) + + h_inv[3]*(x[i][2]-boxlo[2]) + + (image[i] >> 10 & 1023) - 512; + ptr = dchoose; + nstride = 1; + } else if (thresh_array[ithresh] == ZSUTRI) { + double **x = atom->x; + int *image = atom->image; + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + for (i = 0; i < nlocal; i++) + dchoose[i] = h_inv[2]*(x[i][2]-boxlo[2]) + + (image[i] >> 20) - 512; + ptr = dchoose; + nstride = 1; + } else if (thresh_array[ithresh] == IX) { int *image = atom->image; for (i = 0; i < nlocal; i++) @@ -879,6 +945,18 @@ void DumpCustom::parse_fields(int narg, char **arg) if (domain->triclinic) pack_choice[i] = &DumpCustom::pack_zu_triclinic; else pack_choice[i] = &DumpCustom::pack_zu; vtype[i] = DOUBLE; + } else if (strcmp(arg[iarg],"xsu") == 0) { + if (domain->triclinic) pack_choice[i] = &DumpCustom::pack_xsu_triclinic; + else pack_choice[i] = &DumpCustom::pack_xsu; + vtype[i] = DOUBLE; + } else if (strcmp(arg[iarg],"ysu") == 0) { + if (domain->triclinic) pack_choice[i] = &DumpCustom::pack_ysu_triclinic; + else 
pack_choice[i] = &DumpCustom::pack_ysu; + vtype[i] = DOUBLE; + } else if (strcmp(arg[iarg],"zsu") == 0) { + if (domain->triclinic) pack_choice[i] = &DumpCustom::pack_zsu_triclinic; + else pack_choice[i] = &DumpCustom::pack_zsu; + vtype[i] = DOUBLE; } else if (strcmp(arg[iarg],"ix") == 0) { pack_choice[i] = &DumpCustom::pack_ix; vtype[i] = INT; @@ -1254,6 +1332,19 @@ int DumpCustom::modify_param(int narg, char **arg) else if (strcmp(arg[1],"zu") == 0 && domain->triclinic == 1) thresh_array[nthresh] = ZUTRI; + else if (strcmp(arg[1],"xsu") == 0 && domain->triclinic == 0) + thresh_array[nthresh] = XSU; + else if (strcmp(arg[1],"xsu") == 0 && domain->triclinic == 1) + thresh_array[nthresh] = XSUTRI; + else if (strcmp(arg[1],"ysu") == 0 && domain->triclinic == 0) + thresh_array[nthresh] = YSU; + else if (strcmp(arg[1],"ysu") == 0 && domain->triclinic == 1) + thresh_array[nthresh] = YSUTRI; + else if (strcmp(arg[1],"zsu") == 0 && domain->triclinic == 0) + thresh_array[nthresh] = ZSU; + else if (strcmp(arg[1],"zsu") == 0 && domain->triclinic == 1) + thresh_array[nthresh] = ZSUTRI; + else if (strcmp(arg[1],"ix") == 0) thresh_array[nthresh] = IX; else if (strcmp(arg[1],"iy") == 0) thresh_array[nthresh] = IY; else if (strcmp(arg[1],"iz") == 0) thresh_array[nthresh] = IZ; @@ -1821,6 +1912,120 @@ void DumpCustom::pack_zu_triclinic(int n) /* ---------------------------------------------------------------------- */ +void DumpCustom::pack_xsu(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double boxxlo = domain->boxlo[0]; + double invxprd = 1.0/domain->xprd; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = (x[i][0] - boxxlo) * invxprd + (image[i] & 1023) - 512; + n += size_one; + } +} + +/* ---------------------------------------------------------------------- */ + +void DumpCustom::pack_ysu(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double boxylo = 
domain->boxlo[1]; + double invyprd = 1.0/domain->yprd; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = (x[i][1] - boxylo) * invyprd + (image[i] >> 10 & 1023) - 512; + n += size_one; + } +} + +/* ---------------------------------------------------------------------- */ + +void DumpCustom::pack_zsu(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double boxzlo = domain->boxlo[2]; + double invzprd = 1.0/domain->zprd; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = (x[i][2] - boxzlo) * invzprd + (image[i] >> 20) - 512; + n += size_one; + } +} + +/* ---------------------------------------------------------------------- */ + +void DumpCustom::pack_xsu_triclinic(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = h_inv[0]*(x[i][0]-boxlo[0]) + + h_inv[5]*(x[i][1]-boxlo[1]) + + h_inv[4]*(x[i][2]-boxlo[2]) + + (image[i] & 1023) - 512; + n += size_one; + } +} + +/* ---------------------------------------------------------------------- */ + +void DumpCustom::pack_ysu_triclinic(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = h_inv[1]*(x[i][1]-boxlo[1]) + + h_inv[3]*(x[i][2]-boxlo[2]) + + (image[i] >> 10 & 1023) - 512; + n += size_one; + } +} + +/* ---------------------------------------------------------------------- */ + +void DumpCustom::pack_zsu_triclinic(int n) +{ + double **x = atom->x; + int *image = atom->image; + int nlocal = atom->nlocal; + + double *boxlo = domain->boxlo; + double *h_inv = domain->h_inv; + + for (int i = 0; i < nlocal; i++) + if (choose[i]) { + buf[n] = h_inv[2]*(x[i][2]-boxlo[2]) + + (image[i] >> 20) - 512; + n += 
size_one; + } +} + +/* ---------------------------------------------------------------------- */ + void DumpCustom::pack_ix(int n) { int *image = atom->image; From 18b365794d3328b568c07bea29a1afc81568a9cf Mon Sep 17 00:00:00 2001 From: athomps Date: Fri, 29 Apr 2011 23:40:29 +0000 Subject: [PATCH 12/21] Added xsu, ysu, zsu to dump custom and dump cfg git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6045 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/dump_custom.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/dump_custom.h b/src/dump_custom.h index c5c867de5a..b6a65b2822 100644 --- a/src/dump_custom.h +++ b/src/dump_custom.h @@ -120,6 +120,12 @@ class DumpCustom : public Dump { void pack_xu_triclinic(int); void pack_yu_triclinic(int); void pack_zu_triclinic(int); + void pack_xsu(int); + void pack_ysu(int); + void pack_zsu(int); + void pack_xsu_triclinic(int); + void pack_ysu_triclinic(int); + void pack_zsu_triclinic(int); void pack_ix(int); void pack_iy(int); void pack_iz(int); From fb012a60aff355d063d895eac7efabaa00f1db17 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Fri, 29 Apr 2011 23:41:40 +0000 Subject: [PATCH 13/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6046 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/fix_rigid.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/fix_rigid.cpp b/src/fix_rigid.cpp index b24b87b41c..0d276cc348 100644 --- a/src/fix_rigid.cpp +++ b/src/fix_rigid.cpp @@ -532,8 +532,8 @@ void FixRigid::init() } // grow extended arrays and set extended flags for each particle - // dorientflag = 1 if any particles store dipole orientation - // qorientflag = 1 if any particles store quat orientation + // qorientflag = 1 if any particle stores quat orientation + // dorientflag = 1 if any particle stores dipole orientation if (extended) { if (atom->mu_flag) dorientflag = 1; From 1773dd293f39c8bf3e5adb57c8a4e7ad8a251403 Mon Sep 17 00:00:00 2001 From: athomps Date: Fri, 29 Apr 2011 23:44:34 +0000 
Subject: [PATCH 14/21] reverted accidental change git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6048 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/Section_start.txt | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/doc/Section_start.txt b/doc/Section_start.txt index fc45cf8cda..4b4d96693f 100644 --- a/doc/Section_start.txt +++ b/doc/Section_start.txt @@ -782,9 +782,7 @@ one-letter abbreviation can be used: -in or -i -log or -l -screen or -s --var or -v --plog or -pl --pscreen or -ps :ul +-var or -v :ul For example, lmp_ibm might be launched as follows: @@ -848,7 +846,6 @@ logfile is named "file" and each partition also logs information to a file.N. For both one-partition and multi-partition mode, if the specified file is "none", then no log files are created. Using a "log"_log.html command in the input script will override this setting. -Option -plog will override the name of the partition log files file.N. -screen file :pre @@ -862,8 +859,7 @@ the partition ID. If the switch is specified in multi-partition mode, the hi-level screen dump is named "file" and each partition also writes screen information to a file.N. For both one-partition and multi-partition mode, if the specified file is "none", then no screen -output is performed. Option -pscreen will override the name of the -partition screen files file.N. +output is performed. -var name value1 value2 ... :pre @@ -882,27 +878,6 @@ defining index and other kinds of variables and "this section"_Section_commands.html#3_2 for more info on using variables in input scripts. --plog file:pre -Specify the base name for the partition log files, -so partition N writes log information to file.N. If file is -none, then no partition log files are created. -This overrides the -filename specified in the -log command-line option -If this option is not used -the log file for partition N is log.lammps.N or whatever is specified by -the -log command-line option. 
- --pscreen file:pre -specify the base name for the -partition screen file, so partition N writes -screen information to file.N. If file is -none, then no partition screen files are created. -This overrides the -filename specified in the -screen command-line option. -If this option is not used -the screen file for partition N is screen.N or whatever is specified by -the -screen command-line option. - :line 2.7 LAMMPS screen output :h4,link(2_7) From f6151f67354d145b0f9d0da9cc76c66e15a98ea2 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:01:49 +0000 Subject: [PATCH 15/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6051 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- doc/Section_commands.html | 16 ++-- doc/Section_commands.txt | 4 + doc/Section_errors.html | 29 ++++-- doc/Section_errors.txt | 29 ++++-- doc/Section_intro.html | 8 ++ doc/Section_intro.txt | 8 ++ doc/Section_start.html | 197 ++++++++++++++++++-------------------- doc/Section_start.txt | 197 ++++++++++++++++++-------------------- doc/fix_gpu.html | 18 ++-- doc/fix_gpu.txt | 18 ++-- doc/kspace_style.html | 37 ++++++- doc/kspace_style.txt | 37 ++++++- doc/pair_coeff.html | 2 + doc/pair_coeff.txt | 2 + doc/pair_lj_expand.html | 33 ++++++- doc/pair_lj_expand.txt | 31 +++++- doc/pair_morse.html | 34 ++++++- doc/pair_morse.txt | 31 +++++- doc/pair_style.html | 2 + doc/pair_style.txt | 2 + 20 files changed, 478 insertions(+), 257 deletions(-) diff --git a/doc/Section_commands.html b/doc/Section_commands.html index 70bc1e8857..5f996268de 100644 --- a/doc/Section_commands.html +++ b/doc/Section_commands.html @@ -399,12 +399,13 @@ potentials. 
Click on the style itself for a full description: lj/charmm/coul/long/gpulj/charmm/coul/long/optlj/class2lj/class2/coul/cut lj/class2/coul/longlj/cutlj/cut/gpulj/cut/opt lj/cut/coul/cutlj/cut/coul/cut/gpulj/cut/coul/debyelj/cut/coul/long -lj/cut/coul/long/gpulj/cut/coul/long/tip4plj/expandlj/gromacs -lj/gromacs/coul/gromacslj/smoothlj96/cutlj96/cut/gpu -lubricatemeammorsemorse/opt -peri/lpsperi/pmbreaxresquared -softswtabletersoff -tersoff/zblyukawayukawa/colloid +lj/cut/coul/long/gpulj/cut/coul/long/tip4plj/expandlj/expand/gpu +lj/gromacslj/gromacs/coul/gromacslj/smoothlj96/cut +lj96/cut/gpulubricatemeammorse +morse/gpumorse/optperi/lpsperi/pmb +reaxresquaredsoftsw +tabletersofftersoff/zblyukawa +yukawa/colloid

      These are pair styles contributed by users, which can be used if @@ -483,7 +484,8 @@ description: Kspace solvers. Click on the style itself for a full description:

      These are Kspace solvers contributed by users, which can be used if diff --git a/doc/Section_commands.txt b/doc/Section_commands.txt index 1a18b8b9b2..1c58401303 100644 --- a/doc/Section_commands.txt +++ b/doc/Section_commands.txt @@ -611,6 +611,7 @@ potentials. Click on the style itself for a full description: "lj/cut/coul/long/gpu"_pair_lj.html, "lj/cut/coul/long/tip4p"_pair_lj.html, "lj/expand"_pair_lj_expand.html, +"lj/expand/gpu"_pair_lj_expand.html, "lj/gromacs"_pair_gromacs.html, "lj/gromacs/coul/gromacs"_pair_gromacs.html, "lj/smooth"_pair_lj_smooth.html, @@ -619,6 +620,7 @@ potentials. Click on the style itself for a full description: "lubricate"_pair_lubricate.html, "meam"_pair_meam.html, "morse"_pair_morse.html, +"morse/gpu"_pair_morse.html, "morse/opt"_pair_morse.html, "peri/lps"_pair_peri.html, "peri/pmb"_pair_peri.html, @@ -728,6 +730,8 @@ Kspace solvers. Click on the style itself for a full description: "ewald"_kspace_style.html, "pppm"_kspace_style.html, +"pppm/gpu/single"_kspace_style.html, +"pppm/gpu/double"_kspace_style.html, "pppm/tip4p"_kspace_style.html :tb(c=4,ea=c,w=100) These are Kspace solvers contributed by users, which can be used if diff --git a/doc/Section_errors.html b/doc/Section_errors.html index 8cd9ed6e46..b90784e0c0 100644 --- a/doc/Section_errors.html +++ b/doc/Section_errors.html @@ -173,6 +173,10 @@ the bond topologies you have defined. neighbors for each atom. This likely means something is wrong with the bond topologies you have defined. +

      Accelerated style in input script but no fix gpu + +
      GPU acceleration requires fix gpu in the input script. +
      All angle coeffs are not set
      All angle coefficients must be set in the data file or by the @@ -1240,9 +1244,9 @@ non-periodic z dimension. unless you use the kspace_modify command to define a 2d slab with a non-periodic z dimension. -
      Cannot use pair hybrid with multiple GPU pair styles +
      Cannot use pair hybrid with GPU neighbor builds -
      Self-explanatory. +
      See documentation for fix gpu.
      Cannot use pair tail corrections with 2d simulations @@ -1843,7 +1847,7 @@ does not exist.
      Self-explanatory. -
      Could not find or initialize a specified accelerator device +
      Could not find/initialize a specified accelerator device
      Your GPU setup is invalid. @@ -2123,6 +2127,10 @@ model. used. Most likely, one or more atoms have been blown out of the simulation box to a great distance. +
      Double precision is not supported on this accelerator. + +
      In this case, you must compile the GPU library for single precision. +
      Dump cfg and fix not computed at compatible times
      The fix must produce per-atom quantities on timesteps that dump cfg @@ -2355,6 +2363,10 @@ smaller simulation or on more processors.
      Self-explanatory. +
      Fix gpu split must be positive for hybrid pair styles. + +
      See documentation for fix gpu. +
      Fix ID for compute atom/molecule does not exist
      Self-explanatory. @@ -3227,6 +3239,11 @@ this fix.
      This is the way the fix must be defined in your input script. +
      GPU library not compiled for this accelerator + +
      The GPU library was not built for your accelerator. Check the arch flag in +lib/gpu. +
      Gmask function in equal-style variable formula
      Gmask is per-atom operation. @@ -3509,7 +3526,7 @@ simulation box.
      Eigensolve for rigid body was not sufficiently accurate. -
      Insufficient memory on accelerator (or no fix gpu) +
      Insufficient memory on accelerator.
      Self-explanatory. @@ -4587,10 +4604,6 @@ contain the same atom.
      Any rigid body defined by the fix rigid command must contain 2 or more atoms. -
      Out of memory on GPGPU - -
      You are attempting to run with too many atoms on the GPU. -
      Out of range atoms - cannot compute PPPM
      One or more atoms are attempting to map their charge to a PPPM grid diff --git a/doc/Section_errors.txt b/doc/Section_errors.txt index d94e8a9be7..0e2b2e804b 100644 --- a/doc/Section_errors.txt +++ b/doc/Section_errors.txt @@ -170,6 +170,10 @@ An inconsistency was detected when computing the number of 1-4 neighbors for each atom. This likely means something is wrong with the bond topologies you have defined. :dd +{Accelerated style in input script but no fix gpu} :dt + +GPU acceleration requires fix gpu in the input script. :dd + {All angle coeffs are not set} :dt All angle coefficients must be set in the data file or by the @@ -1237,9 +1241,9 @@ For kspace style pppm, all 3 dimensions must have periodic boundaries unless you use the kspace_modify command to define a 2d slab with a non-periodic z dimension. :dd -{Cannot use pair hybrid with multiple GPU pair styles} :dt +{Cannot use pair hybrid with GPU neighbor builds} :dt -Self-explanatory. :dd +See documentation for fix gpu. :dd {Cannot use pair tail corrections with 2d simulations} :dt @@ -1840,7 +1844,7 @@ The compute ID for computing temperature does not exist. :dd Self-explanatory. :dd -{Could not find or initialize a specified accelerator device} :dt +{Could not find/initialize a specified accelerator device} :dt Your GPU setup is invalid. :dd @@ -2120,6 +2124,10 @@ The domain has become extremely large so that neighbor bins cannot be used. Most likely, one or more atoms have been blown out of the simulation box to a great distance. :dd +{Double precision is not supported on this accelerator.} :dt + +In this case, you must compile the GPU library for single precision. :dd + {Dump cfg and fix not computed at compatible times} :dt The fix must produce per-atom quantities on timesteps that dump cfg @@ -2352,6 +2360,10 @@ This is not allowed. Make your SRD bin size smaller. :dd Self-explanatory. :dd +{Fix gpu split must be positive for hybrid pair styles.} :dt + +See documentation for fix gpu. 
:dd + {Fix ID for compute atom/molecule does not exist} :dt Self-explanatory. :dd @@ -3224,6 +3236,11 @@ When using a "*" in the restart file name, no matching file was found. :dd This is the way the fix must be defined in your input script. :dd +{GPU library not compiled for this accelerator} :dt + +The GPU library was not built for your accelerator. Check the arch flag in +lib/gpu. :dd + {Gmask function in equal-style variable formula} :dt Gmask is per-atom operation. :dd @@ -3506,7 +3523,7 @@ Eigensolve for rigid body was not sufficiently accurate. :dd Eigensolve for rigid body was not sufficiently accurate. :dd -{Insufficient memory on accelerator (or no fix gpu)} :dt +{Insufficient memory on accelerator. } :dt Self-explanatory. :dd @@ -4584,10 +4601,6 @@ contain the same atom. :dd Any rigid body defined by the fix rigid command must contain 2 or more atoms. :dd -{Out of memory on GPGPU} :dt - -You are attempting to run with too many atoms on the GPU. :dd - {Out of range atoms - cannot compute PPPM} :dt One or more atoms are attempting to map their charge to a PPPM grid diff --git a/doc/Section_intro.html b/doc/Section_intro.html index f9b00bb689..bce1a9d718 100644 --- a/doc/Section_intro.html +++ b/doc/Section_intro.html @@ -505,6 +505,14 @@ the list.
      + + + + + + + + diff --git a/doc/Section_intro.txt b/doc/Section_intro.txt index e4c26c8aab..a8e46df996 100644 --- a/doc/Section_intro.txt +++ b/doc/Section_intro.txt @@ -490,6 +490,14 @@ the list. :link(sjp,http://www.sandia.gov/~sjplimp) +pppm GPU single and double : Mike Brown (ORNL) +pair_style lj/cut/expand : Inderaj Bains (NVIDIA) +temperature accelerated dynamics (TAD) : Aidan Thompson (Sandia) +pair reax/c and fix qeq/reax : Metin Aktulga (Purdue, now LBNL) +DREIDING force field, pair_style hbond/dreiding, etc : Tod Pascal (CalTech) +fix adapt and compute ti for thermodynamic integreation for free energies : Sai Jayaraman (Sandia) +pair born and pair gauss : Sai Jayaraman (Sandia) +stochastic rotation dynamics (SRD) via fix srd : Jemery Lechman (Sandia) and Pieter in 't Veld (BASF) ipp Perl script tool : Reese Jones (Sandia) eam_database and createatoms tools : Xiaowang Zhou (Sandia) electron force field (eFF) : Andres Jaramillo-Botero and Julius Su (Caltech) diff --git a/doc/Section_start.html b/doc/Section_start.html index a83aaa0ad5..08287e3377 100644 --- a/doc/Section_start.html +++ b/doc/Section_start.html @@ -994,143 +994,130 @@ processing units (GPUs). We plan to add more over time. Currently, they only support NVIDIA GPU cards. To use them you need to install certain NVIDIA CUDA software on your system:

      -
      • Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 -
      • Go to http://www.nvidia.com/object/cuda_get.html -
      • Install a driver and toolkit appropriate for your system (SDK is not necessary) -
      • Follow the instructions in README in lammps/lib/gpu to build the library. -
      • Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties +
        • Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 Go +
        • to http://www.nvidia.com/object/cuda_get.html Install a driver and +
        • toolkit appropriate for your system (SDK is not necessary) Follow the +
        • instructions in README in lammps/lib/gpu to build the library. Run +
        • lammps/lib/gpu/nvc_get_devices to list supported devices and +
        • properties

        GPU configuration

        When using GPUs, you are restricted to one physical GPU per LAMMPS -process. Multiple processes can share a single GPU and in many cases it -will be more efficient to run with multiple processes per GPU. Any GPU -accelerated style requires that fix gpu be used in the -input script to select and initialize the GPUs. The format for the fix -is: +process. Multiple processes can share a single GPU and in many cases +it will be more efficient to run with multiple processes per GPU. Any +GPU accelerated style requires that fix gpu be used in +the input script to select and initialize the GPUs. The format for the +fix is:

        fix name all gpu mode first last split 
         

        where name is the name for the fix. The gpu fix must be the first -fix specified for a given run, otherwise the program will exit -with an error. The gpu fix will not have any effect on runs -that do not use GPU acceleration; there should be no problem -with specifying the fix first in any input script. +fix specified for a given run, otherwise the program will exit with an +error. The gpu fix will not have any effect on runs that do not use +GPU acceleration; there should be no problem with specifying the fix +first in any input script.

        -

        mode can be either "force" or "force/neigh". In the former, -neighbor list calculation is performed on the CPU using the -standard LAMMPS routines. In the latter, the neighbor list -calculation is performed on the GPU. The GPU neighbor list -can be used for better performance, however, it -should not be used with a triclinic box. +

        mode can be either "force" or "force/neigh". In the former, neighbor +list calculation is performed on the CPU using the standard LAMMPS +routines. In the latter, the neighbor list calculation is performed on +the GPU. The GPU neighbor list can be used for better performance, +however, it cannot be used with a triclinic box or with +hybrid pair styles.

        -

        There are cases when it might be more efficient to select the CPU for neighbor -list builds. If a non-GPU enabled style requires a neighbor list, it will also -be built using CPU routines. Redundant CPU and GPU neighbor list calculations -will typically be less efficient. For hybrid pair -styles, GPU calculated neighbor lists might be less efficient because -no particles will be skipped in a given neighbor list. +

        There are cases when it might be more efficient to select the CPU for +neighbor list builds. If a non-GPU enabled style requires a neighbor +list, it will also be built using CPU routines. Redundant CPU and GPU +neighbor list calculations will typically be less efficient.

        -

        first is the ID (as reported by lammps/lib/gpu/nvc_get_devices) -of the first GPU that will be used on each node. last is the -ID of the last GPU that will be used on each node. If you have -only one GPU per node, first and last will typically both be -0. Selecting a non-sequential set of GPU IDs (e.g. 0,1,3) -is not currently supported. +

        first is the ID (as reported by lammps/lib/gpu/nvc_get_devices) of +the first GPU that will be used on each node. last is the ID of the +last GPU that will be used on each node. If you have only one GPU per +node, first and last will typically both be 0. Selecting a +non-sequential set of GPU IDs (e.g. 0,1,3) is not currently supported.

        -

        split is the fraction of particles whose forces, torques, -energies, and/or virials will be calculated on the GPU. This -can be used to perform CPU and GPU force calculations -simultaneously. If split is negative, the software will -attempt to calculate the optimal fraction automatically -every 25 timesteps based on CPU and GPU timings. Because the GPU speedups -are dependent on the number of particles, automatic calculation of the -split can be less efficient, but typically results in loop times -within 20% of an optimal fixed split. +

        split is the fraction of particles whose forces, torques, energies, +and/or virials will be calculated on the GPU. This can be used to +perform CPU and GPU force calculations simultaneously. If split is +negative, the software will attempt to calculate the optimal fraction +automatically every 25 timesteps based on CPU and GPU timings. Because +the GPU speedups are dependent on the number of particles, automatic +calculation of the split can be less efficient, but typically results +in loop times within 20% of an optimal fixed split.

        -

        If you have two GPUs per node, 8 CPU cores per node, and -would like to run on 4 nodes with dynamic balancing of -force calculation across CPU and GPU cores, the fix -might be +

        If you have two GPUs per node, 8 CPU cores per node, and would like to +run on 4 nodes with dynamic balancing of force calculation across CPU +and GPU cores, the fix might be

        fix 0 all gpu force/neigh 0 1 -1 
         
        -

        with LAMMPS run on 32 processes. In this case, all -CPU cores and GPU devices on the nodes would be utilized. -Each GPU device would be shared by 4 CPU cores. The -CPU cores would perform force calculations for some -fraction of the particles at the same time the GPUs -performed force calculation for the other particles. +

        with LAMMPS run on 32 processes. In this case, all CPU cores and GPU +devices on the nodes would be utilized. Each GPU device would be +shared by 4 CPU cores. The CPU cores would perform force calculations +for some fraction of the particles at the same time the GPUs performed +force calculation for the other particles.

        -

        Because of the large number of cores on each GPU -device, it might be more efficient to run on fewer -processes per GPU when the number of particles per process -is small (100's of particles); this can be necessary -to keep the GPU cores busy. +

        Because of the large number of cores on each GPU device, it might be +more efficient to run on fewer processes per GPU when the number of +particles per process is small (100's of particles); this can be +necessary to keep the GPU cores busy.

        GPU input script

        -

        In order to use GPU acceleration in LAMMPS, -fix_gpu -should be used in order to initialize and configure the -GPUs for use. Additionally, GPU enabled styles must be -selected in the input script. Currently, -this is limited to a few pair styles. -Some GPU-enabled styles have additional restrictions -listed in their documentation. +

        In order to use GPU acceleration in LAMMPS, fix_gpu +should be used in order to initialize and configure the GPUs for +use. Additionally, GPU enabled styles must be selected in the input +script. Currently, this is limited to a few pair +styles and PPPM. Some GPU-enabled styles have +additional restrictions listed in their documentation.

        GPU asynchronous pair computation

        -

        The GPU accelerated pair styles can be used to perform -pair style force calculation on the GPU while other -calculations are -performed on the CPU. One method to do this is to specify -a split in the gpu fix as described above. In this case, -force calculation for the pair style will also be performed -on the CPU. +

        The GPU accelerated pair styles can be used to perform pair style +force calculation on the GPU while other calculations are performed on +the CPU. One method to do this is to specify a split in the gpu fix +as described above. In this case, force calculation for the pair +style will also be performed on the CPU.

        -

        When the CPU work in a GPU pair style has finished, -the next force computation will begin, possibly before the -GPU has finished. If split is 1.0 in the gpu fix, the next -force computation will begin almost immediately. This can -be used to run a hybrid GPU pair style at -the same time as a hybrid CPU pair style. In this case, the -GPU pair style should be first in the hybrid command in order to -perform simultaneous calculations. This also -allows bond, angle, -dihedral, improper, -and long-range force -computations to be run simultaneously with the GPU pair style. -Once all CPU force computations have completed, the gpu fix -will block until the GPU has finished all work before continuing -the run. +

        When the CPU work in a GPU pair style has finished, the next force +computation will begin, possibly before the GPU has finished. If +split is 1.0 in the gpu fix, the next force computation will begin +almost immediately. This can be used to run a +hybrid GPU pair style at the same time as a hybrid +CPU pair style. In this case, the GPU pair style should be first in +the hybrid command in order to perform simultaneous calculations. This +also allows bond, angle, +dihedral, improper, and +long-range force computations to be run +simultaneously with the GPU pair style. Once all CPU force +computations have completed, the gpu fix will block until the GPU has +finished all work before continuing the run.

        GPU timing

        GPU accelerated pair styles can perform computations asynchronously -with CPU computations. The "Pair" time reported by LAMMPS -will be the maximum of the time required to complete the CPU -pair style computations and the time required to complete the GPU -pair style computations. Any time spent for GPU-enabled pair styles -for computations that run simultaneously with bond, -angle, dihedral, -improper, and long-range calculations -will not be included in the "Pair" time. +with CPU computations. The "Pair" time reported by LAMMPS will be the +maximum of the time required to complete the CPU pair style +computations and the time required to complete the GPU pair style +computations. Any time spent for GPU-enabled pair styles for +computations that run simultaneously with bond, +angle, dihedral, +improper, and long-range +calculations will not be included in the "Pair" time.

        -

        When mode for the gpu fix is force/neigh, -the time for neighbor list calculations on the GPU will be added -into the "Pair" time, not the "Neigh" time. A breakdown of the -times required for various tasks on the GPU (data copy, neighbor -calculations, force computations, etc.) are output only -with the LAMMPS screen output at the end of each run. These timings represent -total time spent on the GPU for each routine, regardless of asynchronous -CPU calculations. +

        When mode for the gpu fix is force/neigh, the time for neighbor list +calculations on the GPU will be added into the "Pair" time, not the +"Neigh" time. A breakdown of the times required for various tasks on +the GPU (data copy, neighbor calculations, force computations, etc.) +are output only with the LAMMPS screen output at the end of each +run. These timings represent total time spent on the GPU for each +routine, regardless of asynchronous CPU calculations.

        GPU single vs double precision

        -

        See the lammps/lib/gpu/README file for instructions on how to build -the LAMMPS gpu library for single, mixed, and double precision. The latter -requires that your GPU card supports double precision. +

        See the lammps/lib/gpu/README file for instructions on how to build +the LAMMPS gpu library for single, mixed, and double precision. The +latter requires that your GPU card supports double precision.


        diff --git a/doc/Section_start.txt b/doc/Section_start.txt index 4b4d96693f..fbdd015ab4 100644 --- a/doc/Section_start.txt +++ b/doc/Section_start.txt @@ -984,143 +984,130 @@ processing units (GPUs). We plan to add more over time. Currently, they only support NVIDIA GPU cards. To use them you need to install certain NVIDIA CUDA software on your system: -Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 -Go to http://www.nvidia.com/object/cuda_get.html -Install a driver and toolkit appropriate for your system (SDK is not necessary) -Follow the instructions in README in lammps/lib/gpu to build the library. -Run lammps/lib/gpu/nvc_get_devices to list supported devices and properties :ul +Check if you have an NVIDIA card: cat /proc/driver/nvidia/cards/0 Go +to http://www.nvidia.com/object/cuda_get.html Install a driver and +toolkit appropriate for your system (SDK is not necessary) Follow the +instructions in README in lammps/lib/gpu to build the library. Run +lammps/lib/gpu/nvc_get_devices to list supported devices and +properties :ul GPU configuration :h4 When using GPUs, you are restricted to one physical GPU per LAMMPS -process. Multiple processes can share a single GPU and in many cases it -will be more efficient to run with multiple processes per GPU. Any GPU -accelerated style requires that "fix gpu"_fix_gpu.html be used in the -input script to select and initialize the GPUs. The format for the fix -is: +process. Multiple processes can share a single GPU and in many cases +it will be more efficient to run with multiple processes per GPU. Any +GPU accelerated style requires that "fix gpu"_fix_gpu.html be used in +the input script to select and initialize the GPUs. The format for the +fix is: fix {name} all gpu {mode} {first} {last} {split} :pre where {name} is the name for the fix. The gpu fix must be the first -fix specified for a given run, otherwise the program will exit -with an error. 
The gpu fix will not have any effect on runs -that do not use GPU acceleration; there should be no problem -with specifying the fix first in any input script. +fix specified for a given run, otherwise the program will exit with an +error. The gpu fix will not have any effect on runs that do not use +GPU acceleration; there should be no problem with specifying the fix +first in any input script. -{mode} can be either "force" or "force/neigh". In the former, -neighbor list calculation is performed on the CPU using the -standard LAMMPS routines. In the latter, the neighbor list -calculation is performed on the GPU. The GPU neighbor list -can be used for better performance, however, it -should not be used with a triclinic box. +{mode} can be either "force" or "force/neigh". In the former, neighbor +list calculation is performed on the CPU using the standard LAMMPS +routines. In the latter, the neighbor list calculation is performed on +the GPU. The GPU neighbor list can be used for better performance, +however, it cannot not be used with a triclinic box or with +"hybrid"_pair_hybrid.html pair styles. -There are cases when it might be more efficient to select the CPU for neighbor -list builds. If a non-GPU enabled style requires a neighbor list, it will also -be built using CPU routines. Redundant CPU and GPU neighbor list calculations -will typically be less efficient. For "hybrid"_pair_hybrid.html pair -styles, GPU calculated neighbor lists might be less efficient because -no particles will be skipped in a given neighbor list. +There are cases when it might be more efficient to select the CPU for +neighbor list builds. If a non-GPU enabled style requires a neighbor +list, it will also be built using CPU routines. Redundant CPU and GPU +neighbor list calculations will typically be less efficient. -{first} is the ID (as reported by lammps/lib/gpu/nvc_get_devices) -of the first GPU that will be used on each node. 
{last} is the -ID of the last GPU that will be used on each node. If you have -only one GPU per node, {first} and {last} will typically both be -0. Selecting a non-sequential set of GPU IDs (e.g. 0,1,3) -is not currently supported. +{first} is the ID (as reported by lammps/lib/gpu/nvc_get_devices) of +the first GPU that will be used on each node. {last} is the ID of the +last GPU that will be used on each node. If you have only one GPU per +node, {first} and {last} will typically both be 0. Selecting a +non-sequential set of GPU IDs (e.g. 0,1,3) is not currently supported. -{split} is the fraction of particles whose forces, torques, -energies, and/or virials will be calculated on the GPU. This -can be used to perform CPU and GPU force calculations -simultaneously. If {split} is negative, the software will -attempt to calculate the optimal fraction automatically -every 25 timesteps based on CPU and GPU timings. Because the GPU speedups -are dependent on the number of particles, automatic calculation of the -split can be less efficient, but typically results in loop times -within 20% of an optimal fixed split. +{split} is the fraction of particles whose forces, torques, energies, +and/or virials will be calculated on the GPU. This can be used to +perform CPU and GPU force calculations simultaneously. If {split} is +negative, the software will attempt to calculate the optimal fraction +automatically every 25 timesteps based on CPU and GPU timings. Because +the GPU speedups are dependent on the number of particles, automatic +calculation of the split can be less efficient, but typically results +in loop times within 20% of an optimal fixed split. 
-If you have two GPUs per node, 8 CPU cores per node, and -would like to run on 4 nodes with dynamic balancing of -force calculation across CPU and GPU cores, the fix -might be +If you have two GPUs per node, 8 CPU cores per node, and would like to +run on 4 nodes with dynamic balancing of force calculation across CPU +and GPU cores, the fix might be fix 0 all gpu force/neigh 0 1 -1 :pre -with LAMMPS run on 32 processes. In this case, all -CPU cores and GPU devices on the nodes would be utilized. -Each GPU device would be shared by 4 CPU cores. The -CPU cores would perform force calculations for some -fraction of the particles at the same time the GPUs -performed force calculation for the other particles. +with LAMMPS run on 32 processes. In this case, all CPU cores and GPU +devices on the nodes would be utilized. Each GPU device would be +shared by 4 CPU cores. The CPU cores would perform force calculations +for some fraction of the particles at the same time the GPUs performed +force calculation for the other particles. -Because of the large number of cores on each GPU -device, it might be more efficient to run on fewer -processes per GPU when the number of particles per process -is small (100's of particles); this can be necessary -to keep the GPU cores busy. +Because of the large number of cores on each GPU device, it might be +more efficient to run on fewer processes per GPU when the number of +particles per process is small (100's of particles); this can be +necessary to keep the GPU cores busy. GPU input script :h4 -In order to use GPU acceleration in LAMMPS, -"fix_gpu"_fix_gpu.html -should be used in order to initialize and configure the -GPUs for use. Additionally, GPU enabled styles must be -selected in the input script. Currently, -this is limited to a few "pair styles"_pair_style.html. -Some GPU-enabled styles have additional restrictions -listed in their documentation. 
+In order to use GPU acceleration in LAMMPS, "fix_gpu"_fix_gpu.html +should be used in order to initialize and configure the GPUs for +use. Additionally, GPU enabled styles must be selected in the input +script. Currently, this is limited to a few "pair +styles"_pair_style.html and PPPM. Some GPU-enabled styles have +additional restrictions listed in their documentation. GPU asynchronous pair computation :h4 -The GPU accelerated pair styles can be used to perform -pair style force calculation on the GPU while other -calculations are -performed on the CPU. One method to do this is to specify -a {split} in the gpu fix as described above. In this case, -force calculation for the pair style will also be performed -on the CPU. +The GPU accelerated pair styles can be used to perform pair style +force calculation on the GPU while other calculations are performed on +the CPU. One method to do this is to specify a {split} in the gpu fix +as described above. In this case, force calculation for the pair +style will also be performed on the CPU. -When the CPU work in a GPU pair style has finished, -the next force computation will begin, possibly before the -GPU has finished. If {split} is 1.0 in the gpu fix, the next -force computation will begin almost immediately. This can -be used to run a "hybrid"_pair_hybrid.html GPU pair style at -the same time as a hybrid CPU pair style. In this case, the -GPU pair style should be first in the hybrid command in order to -perform simultaneous calculations. This also -allows "bond"_bond_style.html, "angle"_angle_style.html, -"dihedral"_dihedral_style.html, "improper"_improper_style.html, -and "long-range"_kspace_style.html force -computations to be run simultaneously with the GPU pair style. -Once all CPU force computations have completed, the gpu fix -will block until the GPU has finished all work before continuing -the run. 
+When the CPU work in a GPU pair style has finished, the next force +computation will begin, possibly before the GPU has finished. If +{split} is 1.0 in the gpu fix, the next force computation will begin +almost immediately. This can be used to run a +"hybrid"_pair_hybrid.html GPU pair style at the same time as a hybrid +CPU pair style. In this case, the GPU pair style should be first in +the hybrid command in order to perform simultaneous calculations. This +also allows "bond"_bond_style.html, "angle"_angle_style.html, +"dihedral"_dihedral_style.html, "improper"_improper_style.html, and +"long-range"_kspace_style.html force computations to be run +simultaneously with the GPU pair style. Once all CPU force +computations have completed, the gpu fix will block until the GPU has +finished all work before continuing the run. GPU timing :h4 GPU accelerated pair styles can perform computations asynchronously -with CPU computations. The "Pair" time reported by LAMMPS -will be the maximum of the time required to complete the CPU -pair style computations and the time required to complete the GPU -pair style computations. Any time spent for GPU-enabled pair styles -for computations that run simultaneously with "bond"_bond_style.html, -"angle"_angle_style.html, "dihedral"_dihedral_style.html, -"improper"_improper_style.html, and "long-range"_kspace_style.html calculations -will not be included in the "Pair" time. +with CPU computations. The "Pair" time reported by LAMMPS will be the +maximum of the time required to complete the CPU pair style +computations and the time required to complete the GPU pair style +computations. Any time spent for GPU-enabled pair styles for +computations that run simultaneously with "bond"_bond_style.html, +"angle"_angle_style.html, "dihedral"_dihedral_style.html, +"improper"_improper_style.html, and "long-range"_kspace_style.html +calculations will not be included in the "Pair" time. 
-When {mode} for the gpu fix is force/neigh, -the time for neighbor list calculations on the GPU will be added -into the "Pair" time, not the "Neigh" time. A breakdown of the -times required for various tasks on the GPU (data copy, neighbor -calculations, force computations, etc.) are output only -with the LAMMPS screen output at the end of each run. These timings represent -total time spent on the GPU for each routine, regardless of asynchronous -CPU calculations. +When {mode} for the gpu fix is force/neigh, the time for neighbor list +calculations on the GPU will be added into the "Pair" time, not the +"Neigh" time. A breakdown of the times required for various tasks on +the GPU (data copy, neighbor calculations, force computations, etc.) +are output only with the LAMMPS screen output at the end of each +run. These timings represent total time spent on the GPU for each +routine, regardless of asynchronous CPU calculations. GPU single vs double precision :h4 -See the lammps/lib/gpu/README file for instructions on how to build -the LAMMPS gpu library for single, mixed, and double precision. The latter -requires that your GPU card supports double precision. +See the lammps/lib/gpu/README file for instructions on how to build +the LAMMPS gpu library for single, mixed, and double precision. The +latter requires that your GPU card supports double precision. :line diff --git a/doc/fix_gpu.html b/doc/fix_gpu.html index 72839bc0d1..f71a8e8a4a 100644 --- a/doc/fix_gpu.html +++ b/doc/fix_gpu.html @@ -48,14 +48,13 @@ should not be any problems with specifying this fix first in input scripts.

        mode specifies where neighbor list calculations will be performed. If mode is force, neighbor list calculation is performed on the CPU. If mode is force/neigh, neighbor list calculation is -performed on the GPU. GPU neighbor -list calculation currently cannot be used with a triclinic box. +performed on the GPU. GPU neighbor list calculation currently cannot be +used with a triclinic box. GPU neighbor list calculation currently +cannot be used with hybrid pair styles. GPU neighbor lists are not compatible with styles that are not GPU-enabled. When a non-GPU enabled style requires a neighbor list, it will also be built using CPU routines. In these cases, it will typically be more efficient -to only use CPU neighbor list builds. For hybrid pair -styles, GPU calculated neighbor lists might be less efficient because -no particles will be skipped in a given neighbor list. +to only use CPU neighbor list builds.

        first and last specify the GPUs that will be used for simulation. On each node, the GPU IDs in the inclusive range from first to last will @@ -77,7 +76,8 @@ style.

        In order to use GPU acceleration, a GPU enabled style must be selected in the input script in addition to this fix. Currently, -this is limited to a few pair styles. +this is limited to a few pair styles and +the PPPM kspace style.

        More details about these settings and various possible hardware configuration are in this section of the @@ -95,8 +95,10 @@ the run command.

        Restrictions:

        The fix must be the first fix specified for a given run. The force/neigh -mode should not be used with a triclinic box or GPU-enabled pair styles -that need special_bonds settings. +mode should not be used with a triclinic box or hybrid +pair styles. +

        +

        split must be positive when using hybrid pair styles.

        Currently, group-ID must be all.

        diff --git a/doc/fix_gpu.txt b/doc/fix_gpu.txt index 88fa6f5414..df8fbadb8f 100644 --- a/doc/fix_gpu.txt +++ b/doc/fix_gpu.txt @@ -39,14 +39,13 @@ should not be any problems with specifying this fix first in input scripts. {mode} specifies where neighbor list calculations will be performed. If {mode} is force, neighbor list calculation is performed on the CPU. If {mode} is force/neigh, neighbor list calculation is -performed on the GPU. GPU neighbor -list calculation currently cannot be used with a triclinic box. +performed on the GPU. GPU neighbor list calculation currently cannot be +used with a triclinic box. GPU neighbor list calculation currently +cannot be used with "hybrid"_pair_hybrid.html pair styles. GPU neighbor lists are not compatible with styles that are not GPU-enabled. When a non-GPU enabled style requires a neighbor list, it will also be built using CPU routines. In these cases, it will typically be more efficient -to only use CPU neighbor list builds. For "hybrid"_pair_hybrid.html pair -styles, GPU calculated neighbor lists might be less efficient because -no particles will be skipped in a given neighbor list. +to only use CPU neighbor list builds. {first} and {last} specify the GPUs that will be used for simulation. On each node, the GPU IDs in the inclusive range from {first} to {last} will @@ -68,7 +67,8 @@ style. In order to use GPU acceleration, a GPU enabled style must be selected in the input script in addition to this fix. Currently, -this is limited to a few "pair styles"_pair_style.html. +this is limited to a few "pair styles"_pair_style.html and +the PPPM "kspace style"_kspace_style.html. More details about these settings and various possible hardware configuration are in "this section"_Section_start.html#2_8 of the @@ -86,8 +86,10 @@ the "run"_run.html command. [Restrictions:] The fix must be the first fix specified for a given run. 
The force/neigh -{mode} should not be used with a triclinic box or GPU-enabled pair styles -that need "special_bonds"_special_bonds.html settings. +{mode} should not be used with a triclinic box or "hybrid"_pair_hybrid.html +pair styles. + +{split} must be positive when using "hybrid"_pair_hybrid.html pair styles. Currently, group-ID must be all. diff --git a/doc/kspace_style.html b/doc/kspace_style.html index 57c035f570..30b0bcbc1b 100644 --- a/doc/kspace_style.html +++ b/doc/kspace_style.html @@ -15,7 +15,7 @@

        kspace_style style value 
         
        -
        • style = none or ewald or pppm or pppm/tip4p or ewald/n +
          • style = none or ewald or pppm or pppm/tip4p or ewald/n or pppm/gpu/single or pppm/gpu/double
              none value = none
               ewald value = precision
            @@ -25,6 +25,10 @@
               pppm/tip4p value = precision
                 precision = desired accuracy
               ewald/n value = precision
            +    precision = desired accuracy
            +  pppm/gpu/single value = precision
            +    precision = desired accuracy
            +  pppm/gpu/double value = precision
                 precision = desired accuracy 
             
            @@ -72,6 +76,11 @@ long-range potentials.

            Currently, only the ewald/n style can be used with non-orthogonal (triclinic symmetry) simulation boxes.

            +

            The pppm/gpu/single and pppm/gpu/double styles are GPU-enabled +versions of pppm. See more details below. +

            +
            +

            When a kspace style is used, a pair style that includes the short-range correction to the pairwise Coulombic or other 1/r^N forces must also be selected. For Coulombic interactions, these styles are @@ -88,6 +97,27 @@ of K-space vectors for style ewald or the FFT grid size for style

            See the kspace_modify command for additional options of the K-space solvers that can be set.

            +
            + +

            The pppm/gpu/single style performs single precision +charge assignment and force interpolation calculations on the GPU. +The pppm/gpu/double style performs the mesh calculations on the GPU +in double precision. FFT solves are calculated on the CPU in both +cases. If either pppm/gpu/single or pppm/gpu/double are used with +a GPU-enabled pair style, part of the PPPM calculation can be performed +concurrently on the GPU while other calculations for non-bonded and +bonded force calculation are performed on the CPU. +

            +

            More details about GPU settings and various possible hardware +configurations are in this section of the +manual. +

            +

            Additional requirements in your input script to run with GPU-enabled +PPPM styles are as follows: +

            +

            fix gpu must be used. The fix controls +the essential GPU selection and initialization steps. +

            Restrictions:

            A simulation must be 3d and periodic in all dimensions to use an Ewald @@ -103,6 +133,11 @@ LAMMPS section for more info. enabled if LAMMPS was built with that package. See the Making LAMMPS section for more info.

            +

            The pppm/gpu/single and pppm/gpu/double styles are part of the +"gpu" package. They are only enabled if LAMMPS was built with that +package. See the Making LAMMPS section for +more info. +

            When using a long-range pairwise TIP4P potential, you must use kspace style pppm/tip4p and vice versa.

            diff --git a/doc/kspace_style.txt b/doc/kspace_style.txt index b6b12696d2..217978c193 100644 --- a/doc/kspace_style.txt +++ b/doc/kspace_style.txt @@ -12,7 +12,7 @@ kspace_style command :h3 kspace_style style value :pre -style = {none} or {ewald} or {pppm} or {pppm/tip4p} or {ewald/n} :ulb,l +style = {none} or {ewald} or {pppm} or {pppm/tip4p} or {ewald/n} or {pppm/gpu/single} or {pppm/gpu/double} :ulb,l {none} value = none {ewald} value = precision precision = desired accuracy @@ -21,6 +21,10 @@ style = {none} or {ewald} or {pppm} or {pppm/tip4p} or {ewald/n} :ulb,l {pppm/tip4p} value = precision precision = desired accuracy {ewald/n} value = precision + precision = desired accuracy + {pppm/gpu/single} value = precision + precision = desired accuracy + {pppm/gpu/double} value = precision precision = desired accuracy :pre :ule @@ -67,6 +71,11 @@ long-range potentials. Currently, only the {ewald/n} style can be used with non-orthogonal (triclinic symmetry) simulation boxes. +The {pppm/gpu/single} and {pppm/gpu/double} styles are GPU-enabled +version of {pppm}. See more details below. + +:line + When a kspace style is used, a pair style that includes the short-range correction to the pairwise Coulombic or other 1/r^N forces must also be selected. For Coulombic interactions, these styles are @@ -83,6 +92,27 @@ of K-space vectors for style {ewald} or the FFT grid size for style See the "kspace_modify"_kspace_modify.html command for additional options of the K-space solvers that can be set. +:line + +The {pppm/gpu/single} style performs single precision +charge assignment and force interpolation calculations on the GPU. +The {pppm/gpu/double} style performs the mesh calculations on the GPU +in double precision. FFT solves are calculated on the CPU in both +cases. 
If either {pppm/gpu/single} or {pppm/gpu/double} are used with +a GPU-enabled pair style, part of the PPPM calculation can be performed +concurrently on the GPU while other calculations for non-bonded and +bonded force calculation are performed on the CPU. + +More details about GPU settings and various possible hardware +configurations are in "this section"_Section_start.html#2_8 of the +manual. + +Additional requirements in your input script to run with GPU-enabled +PPPM styles are as follows: + +"fix gpu"_fix_gpu.html must be used. The fix controls +the essential GPU selection and initialization steps. + [Restrictions:] A simulation must be 3d and periodic in all dimensions to use an Ewald @@ -98,6 +128,11 @@ The {ewald/n} style is part of the "user-ewaldn" package. It is only enabled if LAMMPS was built with that package. See the "Making LAMMPS"_Section_start.html#2_3 section for more info. +The {pppm/gpu/single} and {pppm/gpu/double} styles are part of the +"gpu" package. They are only enabled if LAMMPS was built with that +package. See the "Making LAMMPS"_Section_start.html#2_3 section for +more info. + When using a long-range pairwise TIP4P potential, you must use kspace style {pppm/tip4p} and vice versa. diff --git a/doc/pair_coeff.html b/doc/pair_coeff.html index fa98d3addd..0f54432555 100644 --- a/doc/pair_coeff.html +++ b/doc/pair_coeff.html @@ -134,6 +134,7 @@ the pair_style command, and coefficients specified by the associated
          • pair_style lj/cut/coul/long/gpu - GPU-enabled version of LJ with long-range Coulomb
          • pair_style lj/cut/coul/long/tip4p - LJ with long-range Coulomb for TIP4P water
          • pair_style lj/expand - Lennard-Jones for variable size particles +
          • pair_style lj/expand/gpu - GPU-enabled version of lj/expand
          • pair_style lj/gromacs - GROMACS-style Lennard-Jones potential
          • pair_style lj/gromacs/coul/gromacs - GROMACS-style LJ and Coulombic potential
          • pair_style lj/smooth - smoothed Lennard-Jones potential @@ -142,6 +143,7 @@ the pair_style command, and coefficients specified by the associated
          • pair_style lubricate - hydrodynamic lubrication forces
          • pair_style meam - modified embedded atom method (MEAM)
          • pair_style morse - Morse potential +
          • pair_style morse/gpu - GPU-enabled version of Morse potential
          • pair_style morse/opt - optimized version of Morse potential
          • pair_style peri/lps - peridynamic LPS potential
          • pair_style peri/pmb - peridynamic PMB potential diff --git a/doc/pair_coeff.txt b/doc/pair_coeff.txt index baf95341db..308e35329c 100644 --- a/doc/pair_coeff.txt +++ b/doc/pair_coeff.txt @@ -131,6 +131,7 @@ the pair_style command, and coefficients specified by the associated "pair_style lj/cut/coul/long/gpu"_pair_lj.html - GPU-enabled version of LJ with long-range Coulomb "pair_style lj/cut/coul/long/tip4p"_pair_lj.html - LJ with long-range Coulomb for TIP4P water "pair_style lj/expand"_pair_lj_expand.html - Lennard-Jones for variable size particles +"pair_style lj/expand/gpu"_pair_lj_expand.html - GPU-enabled version of lj/expand "pair_style lj/gromacs"_pair_gromacs.html - GROMACS-style Lennard-Jones potential "pair_style lj/gromacs/coul/gromacs"_pair_gromacs.html - GROMACS-style LJ and Coulombic potential "pair_style lj/smooth"_pair_lj_smooth.html - smoothed Lennard-Jones potential @@ -139,6 +140,7 @@ the pair_style command, and coefficients specified by the associated "pair_style lubricate"_pair_lubricate.html - hydrodynamic lubrication forces "pair_style meam"_pair_meam.html - modified embedded atom method (MEAM) "pair_style morse"_pair_morse.html - Morse potential +"pair_style morse/gpu"_pair_morse.html - GPU-enabled version of Morse potential "pair_style morse/opt"_pair_morse.html - optimized version of Morse potential "pair_style peri/lps"_pair_peri.html - peridynamic LPS potential "pair_style peri/pmb"_pair_peri.html - peridynamic PMB potential diff --git a/doc/pair_lj_expand.html b/doc/pair_lj_expand.html index 8dfb3d2068..9e766d3f4b 100644 --- a/doc/pair_lj_expand.html +++ b/doc/pair_lj_expand.html @@ -11,10 +11,14 @@

            pair_style lj/expand command

            +

            pair_style lj/expand/gpu command +

            Syntax:

            pair_style lj/expand cutoff 
             
            +
            pair_style lj/expand/gpu cutoff 
            +
            • cutoff = global cutoff for lj/expand interactions (distance units)

            Examples: @@ -49,6 +53,29 @@ commands, or by mixing as described below:

            The delta values can be positive or negative. The last coefficient is optional. If not specified, the global LJ cutoff is used.

            +

            Style lj/expand/gpu is a GPU-enabled version of style lj/expand. +See more details below. +

            +
            + +

            The lj/expand/gpu style is identical to the lj/expand style, +except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the Running on GPUs section of +the manual for more details about hardware and software requirements +for using GPUs. +

            +

            More details about these settings and various possible hardware +configuration are in this section of the +manual. +

            +

            Additional requirements in your input script to run with GPU-enabled styles +are as follows: +

            +

            The newton pair setting must be off and +fix gpu must be used. The fix controls +the essential GPU selection and initialization steps. +


            Mixing, shift, table, tail correction, restart, rRESPA info: @@ -80,7 +107,11 @@ to be specified in an input script that reads a restart file.


            -

            Restrictions: none +

            Restrictions: +

            +

            The lj/expand/gpu style is part of the "gpu" package. It is only +enabled if LAMMPS was built with that package. See the Making +LAMMPS section for more info.

            Related commands:

            diff --git a/doc/pair_lj_expand.txt b/doc/pair_lj_expand.txt index 3c82f5b944..96487df87e 100644 --- a/doc/pair_lj_expand.txt +++ b/doc/pair_lj_expand.txt @@ -7,10 +7,12 @@ :line pair_style lj/expand command :h3 +pair_style lj/expand/gpu command :h3 [Syntax:] pair_style lj/expand cutoff :pre +pair_style lj/expand/gpu cutoff :pre cutoff = global cutoff for lj/expand interactions (distance units) :ul @@ -46,6 +48,29 @@ cutoff (distance units) :ul The delta values can be positive or negative. The last coefficient is optional. If not specified, the global LJ cutoff is used. +Style {lj/expand/gpu} is a GPU-enabled version of style {lj/expand}. +See more details below. + +:line + +The {lj/expand/gpu} style is identical to the {lj/expand} style, +except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the "Running on GPUs"_Section_start.html#2_8 section of +the manual for more details about hardware and software requirements +for using GPUs. + +More details about these settings and various possible hardware +configuration are in "this section"_Section_start.html#2_8 of the +manual. + +Additional requirements in your input script to run with GPU-enabled styles +are as follows: + +The "newton pair"_newton.html setting must be {off} and +"fix gpu"_fix_gpu.html must be used. The fix controls +the essential GPU selection and initialization steps. + :line [Mixing, shift, table, tail correction, restart, rRESPA info]: @@ -77,7 +102,11 @@ This pair style can only be used via the {pair} keyword of the :line -[Restrictions:] none +[Restrictions:] + +The {lj/expand/gpu} style is part of the "gpu" package. It is only +enabled if LAMMPS was built with that package. See the "Making +LAMMPS"_Section_start.html#2_3 section for more info. 
[Related commands:] diff --git a/doc/pair_morse.html b/doc/pair_morse.html index e5183ef53e..0f505c5d28 100644 --- a/doc/pair_morse.html +++ b/doc/pair_morse.html @@ -11,12 +11,18 @@

            pair_style morse command

            +

            pair_style morse/gpu command +

            pair_style morse/opt command

            Syntax:

            pair_style morse cutoff 
             
            +
            pair_style morse/gpu cutoff 
            +
            +
            pair_style morse/opt cutoff 
            +
            • cutoff = global cutoff for Morse interactions (distance units)

            Examples: @@ -53,6 +59,29 @@ give identical answers. Depending on system size and the processor you are running on, it may be 5-25% faster (for the pairwise portion of the run time).

            +

            Style morse/gpu is a GPU-enabled version of style morse. +See more details below. +

            +
            + +

            The morse/gpu style is identical to the morse style, +except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the Running on GPUs section of +the manual for more details about hardware and software requirements +for using GPUs. +

            +

            More details about these settings and various possible hardware +configuration are in this section of the +manual. +

            +

            Additional requirements in your input script to run with GPU-enabled styles +are as follows: +

            +

            The newton pair setting must be off and +fix gpu must be used. The fix controls +the essential GPU selection and initialization steps. +


            Mixing, shift, table, tail correction, restart, rRESPA info: @@ -82,8 +111,9 @@ to be specified in an input script that reads a restart file.

            Restrictions:

            -

            The morse/opt style is part of the "opt" package. It is only -enabled if LAMMPS was built with that package. See the Making +

            The morse/opt style is part of the "opt" package. The morse/gpu +style is part of the "gpu" package. They are only +enabled if LAMMPS was built with those packages. See the Making LAMMPS section for more info.

            Related commands: diff --git a/doc/pair_morse.txt b/doc/pair_morse.txt index 1c1799c242..8e23d84767 100644 --- a/doc/pair_morse.txt +++ b/doc/pair_morse.txt @@ -7,11 +7,14 @@ :line pair_style morse command :h3 +pair_style morse/gpu command :h3 pair_style morse/opt command :h3 [Syntax:] pair_style morse cutoff :pre +pair_style morse/gpu cutoff :pre +pair_style morse/opt cutoff :pre cutoff = global cutoff for Morse interactions (distance units) :ul @@ -49,6 +52,29 @@ give identical answers. Depending on system size and the processor you are running on, it may be 5-25% faster (for the pairwise portion of the run time). +Style {morse/gpu} is a GPU-enabled version of style {morse}. +See more details below. + +:line + +The {morse/gpu} style is identical to the {morse} style, +except that each processor off-loads its pairwise calculations to a +GPU chip. Depending on the hardware available on your system this can provide a +speed-up. See the "Running on GPUs"_Section_start.html#2_8 section of +the manual for more details about hardware and software requirements +for using GPUs. + +More details about these settings and various possible hardware +configuration are in "this section"_Section_start.html#2_8 of the +manual. + +Additional requirements in your input script to run with GPU-enabled styles +are as follows: + +The "newton pair"_newton.html setting must be {off} and +"fix gpu"_fix_gpu.html must be used. The fix controls +the essential GPU selection and initialization steps. + :line [Mixing, shift, table, tail correction, restart, rRESPA info]: @@ -78,8 +104,9 @@ These pair styles can only be used via the {pair} keyword of the [Restrictions:] -The {morse/opt} style is part of the "opt" package. It is only -enabled if LAMMPS was built with that package. See the "Making +The {morse/opt} style is part of the "opt" package. The {morse/gpu} +style is part of the "gpu" package. They are only +enabled if LAMMPS was built with those packages. 
See the "Making LAMMPS"_Section_start.html#2_3 section for more info. [Related commands:] diff --git a/doc/pair_style.html b/doc/pair_style.html index 450428a7bc..862a22d7cc 100644 --- a/doc/pair_style.html +++ b/doc/pair_style.html @@ -136,6 +136,7 @@ the pair_style command, and coefficients specified by the associated

          • pair_style lj/cut/coul/long/gpu - GPU-enabled version of LJ with long-range Coulomb
          • pair_style lj/cut/coul/long/tip4p - LJ with long-range Coulomb for TIP4P water
          • pair_style lj/expand - Lennard-Jones for variable size particles +
          • pair_style lj/expand/gpu - GPU-enabled version of lj/expand
          • pair_style lj/gromacs - GROMACS-style Lennard-Jones potential
          • pair_style lj/gromacs/coul/gromacs - GROMACS-style LJ and Coulombic potential
          • pair_style lj/smooth - smoothed Lennard-Jones potential @@ -144,6 +145,7 @@ the pair_style command, and coefficients specified by the associated
          • pair_style lubricate - hydrodynamic lubrication forces
          • pair_style meam - modified embedded atom method (MEAM)
          • pair_style morse - Morse potential +
          • pair_style morse/gpu - GPU-enabled version of Morse potential
          • pair_style morse/opt - optimized version of Morse potential
          • pair_style peri/lps - peridynamic LPS potential
          • pair_style peri/pmb - peridynamic PMB potential diff --git a/doc/pair_style.txt b/doc/pair_style.txt index 0db8457ea5..1943b32c99 100644 --- a/doc/pair_style.txt +++ b/doc/pair_style.txt @@ -133,6 +133,7 @@ the pair_style command, and coefficients specified by the associated "pair_style lj/cut/coul/long/gpu"_pair_lj.html - GPU-enabled version of LJ with long-range Coulomb "pair_style lj/cut/coul/long/tip4p"_pair_lj.html - LJ with long-range Coulomb for TIP4P water "pair_style lj/expand"_pair_lj_expand.html - Lennard-Jones for variable size particles +"pair_style lj/expand/gpu"_pair_lj_expand.html - GPU-enabled version of lj/expand "pair_style lj/gromacs"_pair_gromacs.html - GROMACS-style Lennard-Jones potential "pair_style lj/gromacs/coul/gromacs"_pair_gromacs.html - GROMACS-style LJ and Coulombic potential "pair_style lj/smooth"_pair_lj_smooth.html - smoothed Lennard-Jones potential @@ -141,6 +142,7 @@ the pair_style command, and coefficients specified by the associated "pair_style lubricate"_pair_lubricate.html - hydrodynamic lubrication forces "pair_style meam"_pair_meam.html - modified embedded atom method (MEAM) "pair_style morse"_pair_morse.html - Morse potential +"pair_style morse/gpu"_pair_morse.html - GPU-enabled version of Morse potential "pair_style morse/opt"_pair_morse.html - optimized version of Morse potential "pair_style peri/lps"_pair_peri.html - peridynamic LPS potential "pair_style peri/pmb"_pair_peri.html - peridynamic PMB potential From 2be078632da846d4e0893bcebe5ee4d03d4da490 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:02:09 +0000 Subject: [PATCH 16/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6052 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- tools/restart2data.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tools/restart2data.cpp b/tools/restart2data.cpp index ba346a5087..70500c7d02 100644 --- a/tools/restart2data.cpp +++ b/tools/restart2data.cpp @@ -1860,7 +1860,8 @@ void 
pair(FILE *fp, Data &data, char *style, int flag) } } - } else if (strcmp(style,"lj/expand") == 0) { + } else if ((strcmp(style,"lj/expand") == 0) || + (strcmp(style,"lj/expand/gpu") == 0)) { double cut_global = read_double(fp); int offset_flag = read_int(fp); @@ -1981,6 +1982,7 @@ void pair(FILE *fp, Data &data, char *style, int flag) } else if (strcmp(style,"meam") == 0) { } else if ((strcmp(style,"morse") == 0) || + (strcmp(style,"morse/gpu") == 0) || (strcmp(style,"morse/opt") == 0)) { double cut_global = read_double(fp); @@ -2837,7 +2839,8 @@ void Data::write(FILE *fp, FILE *fp2) fprintf(fp,"%d %g %g\n",i, pair_lj_epsilon[i],pair_lj_sigma[i]); - } else if (strcmp(pair_style,"lj/expand") == 0) { + } else if ((strcmp(pair_style,"lj/expand") == 0) || + (strcmp(pair_style,"lj/expand/gpu")==0)) { for (int i = 1; i <= ntypes; i++) fprintf(fp,"%d %g %g %g\n",i, pair_ljexpand_epsilon[i],pair_ljexpand_sigma[i], @@ -2855,6 +2858,7 @@ void Data::write(FILE *fp, FILE *fp2) pair_ljsmooth_epsilon[i],pair_ljsmooth_sigma[i]); } else if ((strcmp(pair_style,"morse") == 0) || + (strcmp(pair_style,"morse/gpu") == 0) || (strcmp(pair_style,"morse/opt") == 0)) { for (int i = 1; i <= ntypes; i++) fprintf(fp,"%d %g %g %g\n",i, From 5f799182b3822786373f4e10b43a405711bb27d2 Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:02:52 +0000 Subject: [PATCH 17/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6053 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- lib/gpu/Makefile.fermi | 2 +- lib/gpu/Makefile.lens | 6 +- lib/gpu/Makefile.lincoln | 2 +- lib/gpu/Makefile.linux | 2 +- lib/gpu/Makefile.linux_opencl | 2 +- lib/gpu/Makefile.longhorn | 2 +- lib/gpu/Makefile.mac | 2 +- lib/gpu/Makefile.mac_opencl | 2 +- lib/gpu/Nvidia.makefile | 103 +++++- lib/gpu/Opencl.makefile | 84 +++-- lib/gpu/README | 1 + lib/gpu/atomic_gpu_memory.cpp | 133 ++++--- lib/gpu/atomic_gpu_memory.h | 66 ++-- lib/gpu/charge_gpu_memory.cpp | 140 +++++--- lib/gpu/charge_gpu_memory.h | 72 ++-- 
lib/gpu/cmm_cut_gpu.cpp | 68 ++-- lib/gpu/cmm_cut_gpu_kernel.cu | 199 ++++++++--- lib/gpu/cmm_cut_gpu_memory.cpp | 49 +-- lib/gpu/cmm_cut_gpu_memory.h | 21 +- lib/gpu/cmmc_long_gpu.cpp | 82 ++--- lib/gpu/cmmc_long_gpu_kernel.cu | 208 ++++++++--- lib/gpu/cmmc_long_gpu_memory.cpp | 57 +-- lib/gpu/cmmc_long_gpu_memory.h | 25 +- lib/gpu/crml_gpu.cpp | 95 ++--- lib/gpu/crml_gpu_kernel.cu | 212 ++++++++--- lib/gpu/crml_gpu_memory.cpp | 42 ++- lib/gpu/crml_gpu_memory.h | 29 +- lib/gpu/gb_gpu.cpp | 203 ++++++----- lib/gpu/gb_gpu_extra.h | 5 +- lib/gpu/gb_gpu_kernel.cu | 535 ++++++++++++++------------- lib/gpu/gb_gpu_kernel_lj.cu | 261 ++++++++++---- lib/gpu/gb_gpu_kernel_nbor.cu | 5 +- lib/gpu/gb_gpu_memory.cpp | 94 +++-- lib/gpu/gb_gpu_memory.h | 75 ++-- lib/gpu/geryon/VERSION.txt | 4 +- lib/gpu/geryon/nvc_device.h | 4 +- lib/gpu/geryon/nvd_device.h | 16 +- lib/gpu/geryon/nvd_timer.h | 12 + lib/gpu/geryon/ocl_timer.h | 12 + lib/gpu/geryon/ucl_arg_kludge.h | 597 ++++++++++++++++++++++++++++++- lib/gpu/geryon/ucl_d_mat.h | 40 ++- lib/gpu/geryon/ucl_d_vec.h | 35 +- lib/gpu/geryon/ucl_h_mat.h | 44 ++- lib/gpu/geryon/ucl_h_vec.h | 40 ++- lib/gpu/geryon/ucl_nv_kernel.h | 19 +- lib/gpu/lj96_cut_gpu.cpp | 68 ++-- lib/gpu/lj96_cut_gpu_kernel.cu | 197 +++++++--- lib/gpu/lj96_cut_gpu_memory.cpp | 35 +- lib/gpu/lj96_cut_gpu_memory.h | 21 +- lib/gpu/lj_cut_gpu.cpp | 67 ++-- lib/gpu/lj_cut_gpu_kernel.cu | 197 +++++++--- lib/gpu/lj_cut_gpu_memory.cpp | 49 +-- lib/gpu/lj_cut_gpu_memory.h | 21 +- lib/gpu/ljc_cut_gpu.cpp | 82 ++--- lib/gpu/ljc_cut_gpu_kernel.cu | 209 ++++++++--- lib/gpu/ljc_cut_gpu_memory.cpp | 52 +-- lib/gpu/ljc_cut_gpu_memory.h | 25 +- lib/gpu/ljcl_cut_gpu.cpp | 82 ++--- lib/gpu/ljcl_cut_gpu_kernel.cu | 208 ++++++++--- lib/gpu/ljcl_cut_gpu_memory.cpp | 35 +- lib/gpu/ljcl_cut_gpu_memory.h | 25 +- lib/gpu/pair_gpu_atom.cpp | 360 +++---------------- lib/gpu/pair_gpu_atom.h | 270 +++++++------- lib/gpu/pair_gpu_balance.h | 87 ++--- lib/gpu/pair_gpu_build_kernel.cu | 77 ++-- 
lib/gpu/pair_gpu_device.cpp | 448 ++++++++++++++++++++--- lib/gpu/pair_gpu_device.h | 181 +++++++++- lib/gpu/pair_gpu_nbor.cpp | 196 +++++----- lib/gpu/pair_gpu_nbor.h | 41 +-- lib/gpu/pair_gpu_precision.h | 2 - 70 files changed, 4489 insertions(+), 2253 deletions(-) diff --git a/lib/gpu/Makefile.fermi b/lib/gpu/Makefile.fermi index d830c8924c..98c823cf40 100644 --- a/lib/gpu/Makefile.fermi +++ b/lib/gpu/Makefile.fermi @@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 -Xlinker -rpath -Xlinker $(CUDA_HOME)/lib64 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpic++ -DMPI_GERYON -I$(CUDA_HOME)/include +CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -I$(CUDA_HOME)/include CUDR_OPTS = -O3 -ffast-math -funroll-loops -DMPI_GERYON BIN_DIR = ./ diff --git a/lib/gpu/Makefile.lens b/lib/gpu/Makefile.lens index 3b6301277f..d049967c5f 100644 --- a/lib/gpu/Makefile.lens +++ b/lib/gpu/Makefile.lens @@ -17,16 +17,16 @@ # Paul Crozier (SNL), pscrozi@sandia.gov # ------------------------------------------------------------------------- */ -CUDA_HOME = /sw/analysis-x64/cuda/3.0/sl5.0_binary/ +CUDA_HOME = /sw/analysis-x64/cuda/3.2/centos5.5_binary/ NVCC = nvcc CUDA_ARCH = -arch=sm_13 -CUDA_PRECISION = -D_SINGLE_SINGLE +CUDA_PRECISION = -D_SINGLE_DOUBLE CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpic++ -DMPI_GERYON -openmp +CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -openmp CUDR_OPTS = -O2 -xSSE2 -ip -use-intel-optimized-headers -fno-alias BIN_DIR = ./ diff --git a/lib/gpu/Makefile.lincoln b/lib/gpu/Makefile.lincoln index 97a7901811..bbaca61ef1 100644 --- a/lib/gpu/Makefile.lincoln +++ b/lib/gpu/Makefile.lincoln @@ -24,7 +24,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 -Wl,-rpath,$(CUDA_HOME)/lib64 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpic++ -DMPI_GERYON +CUDR_CPP = mpic++ -DMPI_GERYON 
-DUCL_NO_EXIT CUDR_OPTS = -O3 -DMPI_GERYON -ffast-math -funroll-loops BIN_DIR = ./ diff --git a/lib/gpu/Makefile.linux b/lib/gpu/Makefile.linux index c0001a54ab..d69a00a817 100644 --- a/lib/gpu/Makefile.linux +++ b/lib/gpu/Makefile.linux @@ -26,7 +26,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib64 CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpic++ -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK +CUDR_CPP = mpic++ -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias BIN_DIR = ./ diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl index 69522298c5..3d65c9dc48 100644 --- a/lib/gpu/Makefile.linux_opencl +++ b/lib/gpu/Makefile.linux_opencl @@ -17,7 +17,7 @@ # Paul Crozier (SNL), pscrozi@sandia.gov # ------------------------------------------------------------------------- */ -OCL_CPP = mpic++ -I./geryon/opencl -O3 -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK +OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK OCL_LINK = -lOpenCL OCL_PREC = -D_SINGLE_SINGLE diff --git a/lib/gpu/Makefile.longhorn b/lib/gpu/Makefile.longhorn index ba921f0f68..cc41174332 100644 --- a/lib/gpu/Makefile.longhorn +++ b/lib/gpu/Makefile.longhorn @@ -23,7 +23,7 @@ CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(TACC_CUDA_LIB) -Wl,-rpath,$(TACC_CUDA_LIB) CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -CUDR_CPP = mpicxx -DMPI_GERYON -DMPICH_IGNORE_CXX_SEEK +CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK CUDR_OPTS = -O2 # -xHost -no-prec-div -ansi-alias BIN_DIR = ./ diff --git a/lib/gpu/Makefile.mac b/lib/gpu/Makefile.mac index f061a1a68a..5276ac10b2 100644 --- a/lib/gpu/Makefile.mac +++ b/lib/gpu/Makefile.mac @@ -24,7 +24,7 @@ CUDA_ARCH = -arch=sm_11 CUDA_PRECISION = -D_SINGLE_SINGLE CUDA_INCLUDE = -I$(CUDA_HOME)/include CUDA_LIB = -L$(CUDA_HOME)/lib -CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math -m32 +CUDA_OPTS = -DUNIX -DUCL_NO_EXIT -O3 -Xptxas -v 
--use_fast_math -m32 CUDR_CPP = mpic++ CUDR_OPTS = -O2 -m32 -g diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl index 53d6d466e2..50ed67e9c3 100644 --- a/lib/gpu/Makefile.mac_opencl +++ b/lib/gpu/Makefile.mac_opencl @@ -17,7 +17,7 @@ # Paul Crozier (SNL), pscrozi@sandia.gov # ------------------------------------------------------------------------- */ -OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON +OCL_CPP = mpic++ -I./geryon/opencl_1_0 -O3 -DMPI_GERYON -DUCL_NO_EXIT OCL_LINK = -framework OpenCL OCL_PREC = -D_SINGLE_SINGLE diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index adf281e156..17f616ab37 100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -13,7 +13,8 @@ # # /* ---------------------------------------------------------------------- # Contributing authors: Mike Brown (ORNL), brownw@ornl.gov -# Peng Wang (Nvidia), penwang@nvidia.com +# Peng Wang (Nvidia), penwang@nvidia.com +# Inderaj Bains (NVIDIA), ibains@nvidia.com # Paul Crozier (SNL), pscrozi@sandia.gov # ------------------------------------------------------------------------- */ @@ -28,10 +29,11 @@ GPU_LIB = $(LIB_DIR)/libgpu.a # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) NVC_H = $(wildcard ./geryon/nvc*.h) $(UCL_H) -NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) +NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) nv_kernel_def.h # Headers for Pair Stuff -PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \ - pair_gpu_device.h pair_gpu_balance.h +PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \ + pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \ + pair_gpu_balance.h pppm_gpu_memory.h ALL_H = $(NVD_H) $(PAIR_H) @@ -39,28 +41,37 @@ EXECS = $(BIN_DIR)/nvc_get_devices CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \ $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \ $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o -OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \ 
- $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \ - $(OBJ_DIR)/charge_gpu_memory.o \ +OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \ + $(OBJ_DIR)/pair_gpu_nbor.o $(OBJ_DIR)/pair_gpu_nbor_shared.o \ + $(OBJ_DIR)/pair_gpu_device.o \ + $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \ + $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \ $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \ $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \ $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \ + $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \ $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \ $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \ + $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \ $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \ $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o \ $(CUDPP) -PTXS = $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \ +PTXS = $(OBJ_DIR)/pair_gpu_dev_kernel.ptx \ + $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom_ptx.h \ $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx $(OBJ_DIR)/pair_gpu_nbor_ptx.h \ $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h \ + $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h \ + $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h \ $(OBJ_DIR)/gb_gpu_kernel_nbor.ptx $(OBJ_DIR)/gb_gpu_kernel.ptx \ $(OBJ_DIR)/gb_gpu_kernel_lj.ptx $(OBJ_DIR)/gb_gpu_ptx.h \ $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_gpu_ptx.h \ $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj96_cut_gpu_ptx.h \ + $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h \ $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_cut_gpu_ptx.h \ $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljcl_cut_gpu_ptx.h \ - $(OBJ_DIR)/crml_cut_gpu_kernel.ptx $(OBJ_DIR)/crml_cut_gpu_ptx.h \ + $(OBJ_DIR)/morse_gpu_kernel.ptx 
$(OBJ_DIR)/morse_gpu_ptx.h \ + $(OBJ_DIR)/crml_gpu_kernel.ptx $(OBJ_DIR)/crml_gpu_ptx.h \ $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_cut_gpu_ptx.h \ $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/cmmc_long_gpu_ptx.h @@ -93,6 +104,9 @@ $(OBJ_DIR)/pair_gpu_atom_ptx.h: $(OBJ_DIR)/pair_gpu_atom_kernel.ptx $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(NVD_H) $(OBJ_DIR)/pair_gpu_atom_ptx.h $(CUDR) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(NVD_H) + $(CUDR) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/pair_gpu_nbor_kernel.ptx: pair_gpu_nbor_kernel.cu $(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_nbor_kernel.cu @@ -105,11 +119,20 @@ $(OBJ_DIR)/pair_gpu_build_kernel.ptx: pair_gpu_build_kernel.cu $(OBJ_DIR)/pair_gpu_build_ptx.h: $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_build_kernel.ptx $(OBJ_DIR)/pair_gpu_build_ptx.h -$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H) +$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OBJ_DIR)/pair_gpu_nbor_ptx.h $(OBJ_DIR)/pair_gpu_build_ptx.h $(NVD_H) + $(CUDR) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h pair_gpu_nbor_shared.h $(NVD_H) $(CUDR) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(NVD_H) - $(CUDR) -o $@ -c pair_gpu_device.cpp +$(OBJ_DIR)/pair_gpu_dev_kernel.ptx: pair_gpu_dev_kernel.cu + $(CUDA) --ptx -DNV_KERNEL -o $@ pair_gpu_dev_kernel.cu + +$(OBJ_DIR)/pair_gpu_dev_ptx.h: $(OBJ_DIR)/pair_gpu_dev_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pair_gpu_dev_kernel.ptx $(OBJ_DIR)/pair_gpu_dev_ptx.h + +$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_ptx.h + $(CUDR) -o $@ -c pair_gpu_device.cpp 
-I$(OBJ_DIR) $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp $(CUDR) -o $@ -c atomic_gpu_memory.cpp @@ -117,6 +140,24 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(ALL_H) atomic_gpu_memory.h atomic_gpu_memory.c $(OBJ_DIR)/charge_gpu_memory.o: $(ALL_H) charge_gpu_memory.h charge_gpu_memory.cpp $(CUDR) -o $@ -c charge_gpu_memory.cpp +$(OBJ_DIR)/pppm_f_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ pppm_gpu_kernel.cu + +$(OBJ_DIR)/pppm_f_gpu_ptx.h: $(OBJ_DIR)/pppm_f_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_f_gpu_kernel.ptx $(OBJ_DIR)/pppm_f_gpu_ptx.h + +$(OBJ_DIR)/pppm_d_gpu_kernel.ptx: pppm_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ pppm_gpu_kernel.cu + +$(OBJ_DIR)/pppm_d_gpu_ptx.h: $(OBJ_DIR)/pppm_d_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/pppm_d_gpu_kernel.ptx $(OBJ_DIR)/pppm_d_gpu_ptx.h + +$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_f_gpu_ptx.h $(OBJ_DIR)/pppm_d_gpu_ptx.h + $(CUDR) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp + $(CUDR) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/gb_gpu_kernel.ptx: gb_gpu_kernel.cu pair_gpu_precision.h gb_gpu_extra.h $(CUDA) --ptx -DNV_KERNEL -o $@ gb_gpu_kernel.cu @@ -144,7 +185,7 @@ $(OBJ_DIR)/lj_cut_gpu_ptx.h: $(OBJ_DIR)/lj_cut_gpu_kernel.ptx $(OBJ_DIR)/lj_cut_ $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o $(CUDR) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp +$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h $(CUDR) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx: ljc_cut_gpu_kernel.cu 
pair_gpu_precision.h @@ -156,7 +197,7 @@ $(OBJ_DIR)/ljc_cut_gpu_ptx.h: $(OBJ_DIR)/ljc_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc_c $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o $(CUDR) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp +$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h $(CUDR) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx: ljcl_cut_gpu_kernel.cu pair_gpu_precision.h @@ -168,9 +209,21 @@ $(OBJ_DIR)/ljcl_cut_gpu_ptx.h: $(OBJ_DIR)/ljcl_cut_gpu_kernel.ptx $(OBJ_DIR)/ljc $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o $(CUDR) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp +$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h $(CUDR) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/morse_gpu_kernel.ptx: morse_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ morse_gpu_kernel.cu + +$(OBJ_DIR)/morse_gpu_ptx.h: $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/morse_gpu_kernel.ptx $(OBJ_DIR)/morse_gpu_ptx.h + +$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o + $(CUDR) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h + $(CUDR) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/crml_gpu_kernel.ptx: crml_gpu_kernel.cu pair_gpu_precision.h $(CUDA) --ptx -DNV_KERNEL -o $@ crml_gpu_kernel.cu @@ -180,7 +233,7 @@ $(OBJ_DIR)/crml_gpu_ptx.h: $(OBJ_DIR)/crml_gpu_kernel.ptx 
$(OBJ_DIR)/crml_gpu_ke $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_ptx.h $(OBJ_DIR)/charge_gpu_memory.o $(CUDR) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp +$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h $(CUDR) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx: lj96_cut_gpu_kernel.cu pair_gpu_precision.h @@ -192,9 +245,21 @@ $(OBJ_DIR)/lj96_cut_gpu_ptx.h: $(OBJ_DIR)/lj96_cut_gpu_kernel.ptx $(OBJ_DIR)/lj9 $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o $(CUDR) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp +$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h $(CUDR) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/lj_expand_gpu_kernel.ptx: lj_expand_gpu_kernel.cu pair_gpu_precision.h + $(CUDA) --ptx -DNV_KERNEL -o $@ lj_expand_gpu_kernel.cu + +$(OBJ_DIR)/lj_expand_gpu_ptx.h: $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_kernel.ptx + $(BSH) ./geryon/file_to_cstr.sh $(OBJ_DIR)/lj_expand_gpu_kernel.ptx $(OBJ_DIR)/lj_expand_gpu_ptx.h + +$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o + $(CUDR) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h + $(CUDR) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx: cmm_cut_gpu_kernel.cu pair_gpu_precision.h $(CUDA) --ptx -DNV_KERNEL -o $@ cmm_cut_gpu_kernel.cu @@ -204,7 +269,7 @@ $(OBJ_DIR)/cmm_cut_gpu_ptx.h: $(OBJ_DIR)/cmm_cut_gpu_kernel.ptx $(OBJ_DIR)/cmm_c $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) 
cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o $(CUDR) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp +$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h $(CUDR) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx: cmmc_long_gpu_kernel.cu pair_gpu_precision.h @@ -216,7 +281,7 @@ $(OBJ_DIR)/cmmc_long_gpu_ptx.h: $(OBJ_DIR)/cmmc_long_gpu_kernel.ptx $(OBJ_DIR)/c $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_ptx.h $(OBJ_DIR)/atomic_gpu_memory.o $(CUDR) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp +$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h $(CUDR) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR) $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVC_H) diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index ac7aecc2ee..45e21736a3 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -14,6 +14,7 @@ # /* ---------------------------------------------------------------------- # Contributing authors: Mike Brown (ORNL), brownw@ornl.gov # Peng Wang (Nvidia), penwang@nvidia.com +# Inderaj Bains (NVIDIA), ibains@nvidia.com # Paul Crozier (SNL), pscrozi@sandia.gov # ------------------------------------------------------------------------- */ @@ -23,30 +24,37 @@ OCL_LIB = $(LIB_DIR)/libgpu.a UCL_H = $(wildcard ./geryon/ucl*.h) OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) # Headers for Pair Stuff -PAIR_H = pair_gpu_atom.h pair_gpu_nbor.h pair_gpu_precision.h \ - pair_gpu_device.h pair_gpu_balance.h +PAIR_H = pair_gpu_atom.h pair_gpu_ans.h pair_gpu_nbor_shared.h \ + pair_gpu_nbor.h pair_gpu_precision.h pair_gpu_device.h \ + pair_gpu_balance.h pppm_gpu_memory.h 
ALL_H = $(OCL_H) $(PAIR_H) EXECS = $(BIN_DIR)/ocl_get_devices -OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_nbor.o \ - $(OBJ_DIR)/pair_gpu_device.o $(OBJ_DIR)/atomic_gpu_memory.o \ - $(OBJ_DIR)/charge_gpu_memory.o \ +OBJS = $(OBJ_DIR)/pair_gpu_atom.o $(OBJ_DIR)/pair_gpu_ans.o \ + $(OBJ_DIR)/pair_gpu_nbor_shared.o $(OBJ_DIR)/pair_gpu_nbor.o \ + $(OBJ_DIR)/pair_gpu_device.o \ + $(OBJ_DIR)/atomic_gpu_memory.o $(OBJ_DIR)/charge_gpu_memory.o \ + $(OBJ_DIR)/pppm_gpu_memory.o $(OBJ_DIR)/pppm_l_gpu.o \ $(OBJ_DIR)/gb_gpu_memory.o $(OBJ_DIR)/gb_gpu.o \ $(OBJ_DIR)/lj_cut_gpu_memory.o $(OBJ_DIR)/lj_cut_gpu.o \ $(OBJ_DIR)/lj96_cut_gpu_memory.o $(OBJ_DIR)/lj96_cut_gpu.o \ + $(OBJ_DIR)/lj_expand_gpu_memory.o $(OBJ_DIR)/lj_expand_gpu.o \ $(OBJ_DIR)/ljc_cut_gpu_memory.o $(OBJ_DIR)/ljc_cut_gpu.o \ $(OBJ_DIR)/ljcl_cut_gpu_memory.o $(OBJ_DIR)/ljcl_cut_gpu.o \ + $(OBJ_DIR)/morse_gpu_memory.o $(OBJ_DIR)/morse_gpu.o \ $(OBJ_DIR)/crml_gpu_memory.o $(OBJ_DIR)/crml_gpu.o \ $(OBJ_DIR)/cmm_cut_gpu_memory.o $(OBJ_DIR)/cmm_cut_gpu.o \ $(OBJ_DIR)/cmmc_long_gpu_memory.o $(OBJ_DIR)/cmmc_long_gpu.o -KERS = $(OBJ_DIR)/pair_gpu_atom_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h \ +KERS = $(OBJ_DIR)/pair_gpu_dev_cl.h $(OBJ_DIR)/pair_gpu_atom_cl.h \ + $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/pppm_gpu_cl.h \ $(OBJ_DIR)/gb_gpu_nbor_cl.h $(OBJ_DIR)/gb_gpu_cl.h \ $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h \ - $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h \ - $(OBJ_DIR)/crml_gpu_cl.h \ - $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h - + $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h \ + $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/morse_gpu_cl.h \ + $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h \ + $(OBJ_DIR)/cmmc_long_gpu_cl.h + OCL_EXECS = $(BIN_DIR)/ocl_get_devices all: $(OCL_LIB) $(EXECS) @@ -57,14 +65,23 @@ $(OBJ_DIR)/pair_gpu_atom_cl.h: pair_gpu_atom_kernel.cu $(OBJ_DIR)/pair_gpu_atom.o: pair_gpu_atom.cpp pair_gpu_atom.h $(OCL_H) 
$(OBJ_DIR)/pair_gpu_atom_cl.h $(OCL) -o $@ -c pair_gpu_atom.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/pair_gpu_ans.o: pair_gpu_ans.cpp pair_gpu_ans.h $(OCL_H) + $(OCL) -o $@ -c pair_gpu_ans.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/pair_gpu_nbor_cl.h: pair_gpu_nbor_kernel.cu $(BSH) ./geryon/file_to_cstr.sh pair_gpu_nbor_kernel.cu $(OBJ_DIR)/pair_gpu_nbor_cl.h -$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h +$(OBJ_DIR)/pair_gpu_nbor_shared.o: pair_gpu_nbor_shared.cpp pair_gpu_nbor_shared.h $(OCL_H) $(OBJ_DIR)/pair_gpu_nbor_cl.h + $(OCL) -o $@ -c pair_gpu_nbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pair_gpu_nbor.o: pair_gpu_nbor.cpp pair_gpu_nbor.h $(OCL_H) pair_gpu_nbor_shared.h $(OCL) -o $@ -c pair_gpu_nbor.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(OCL_H) - $(OCL) -o $@ -c pair_gpu_device.cpp +$(OBJ_DIR)/pair_gpu_dev_cl.h: pair_gpu_dev_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh pair_gpu_dev_kernel.cu $(OBJ_DIR)/pair_gpu_dev_cl.h + +$(OBJ_DIR)/pair_gpu_device.o: pair_gpu_device.cpp pair_gpu_device.h $(ALL_H) $(OBJ_DIR)/pair_gpu_dev_cl.h + $(OCL) -o $@ -c pair_gpu_device.cpp -I$(OBJ_DIR) $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.cpp $(OCL) -o $@ -c atomic_gpu_memory.cpp @@ -72,6 +89,15 @@ $(OBJ_DIR)/atomic_gpu_memory.o: $(OCL_H) atomic_gpu_memory.h atomic_gpu_memory.c $(OBJ_DIR)/charge_gpu_memory.o: $(OCL_H) charge_gpu_memory.h charge_gpu_memory.cpp $(OCL) -o $@ -c charge_gpu_memory.cpp +$(OBJ_DIR)/pppm_gpu_cl.h: pppm_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh pppm_gpu_kernel.cu $(OBJ_DIR)/pppm_gpu_cl.h; + +$(OBJ_DIR)/pppm_gpu_memory.o: $(ALL_H) pppm_gpu_memory.h pppm_gpu_memory.cpp $(OBJ_DIR)/pppm_gpu_cl.h $(OBJ_DIR)/pppm_gpu_cl.h + $(OCL) -o $@ -c pppm_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/pppm_l_gpu.o: $(ALL_H) pppm_gpu_memory.h pppm_l_gpu.cpp + $(OCL) -o $@ -c pppm_l_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/gb_gpu_nbor_cl.h: gb_gpu_kernel_nbor.cu 
$(BSH) ./geryon/file_to_cstr.sh gb_gpu_kernel_nbor.cu $(OBJ_DIR)/gb_gpu_nbor_cl.h @@ -93,7 +119,7 @@ $(OBJ_DIR)/lj_cut_gpu_cl.h: lj_cut_gpu_kernel.cu $(OBJ_DIR)/lj_cut_gpu_memory.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu_memory.cpp $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o $(OCL) -o $@ -c lj_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp +$(OBJ_DIR)/lj_cut_gpu.o: $(ALL_H) lj_cut_gpu_memory.h lj_cut_gpu.cpp atomic_gpu_memory.h $(OCL) -o $@ -c lj_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu @@ -102,7 +128,7 @@ $(OBJ_DIR)/ljc_cut_gpu_cl.h: ljc_cut_gpu_kernel.cu $(OBJ_DIR)/ljc_cut_gpu_memory.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu_memory.cpp $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljc_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o $(OCL) -o $@ -c ljc_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp +$(OBJ_DIR)/ljc_cut_gpu.o: $(ALL_H) ljc_cut_gpu_memory.h ljc_cut_gpu.cpp charge_gpu_memory.h $(OCL) -o $@ -c ljc_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu @@ -111,16 +137,25 @@ $(OBJ_DIR)/ljcl_cut_gpu_cl.h: ljcl_cut_gpu_kernel.cu $(OBJ_DIR)/ljcl_cut_gpu_memory.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu_memory.cpp $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/ljcl_cut_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o $(OCL) -o $@ -c ljcl_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp +$(OBJ_DIR)/ljcl_cut_gpu.o: $(ALL_H) ljcl_cut_gpu_memory.h ljcl_cut_gpu.cpp charge_gpu_memory.h $(OCL) -o $@ -c ljcl_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/morse_gpu_cl.h: morse_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh morse_gpu_kernel.cu $(OBJ_DIR)/morse_gpu_cl.h; + +$(OBJ_DIR)/morse_gpu_memory.o: $(ALL_H) morse_gpu_memory.h 
morse_gpu_memory.cpp $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/morse_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o + $(OCL) -o $@ -c morse_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/morse_gpu.o: $(ALL_H) morse_gpu_memory.h morse_gpu.cpp atomic_gpu_memory.h + $(OCL) -o $@ -c morse_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/crml_gpu_cl.h: crml_gpu_kernel.cu $(BSH) ./geryon/file_to_cstr.sh crml_gpu_kernel.cu $(OBJ_DIR)/crml_gpu_cl.h; $(OBJ_DIR)/crml_gpu_memory.o: $(ALL_H) crml_gpu_memory.h crml_gpu_memory.cpp $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/crml_gpu_cl.h $(OBJ_DIR)/charge_gpu_memory.o $(OCL) -o $@ -c crml_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp +$(OBJ_DIR)/crml_gpu.o: $(ALL_H) crml_gpu_memory.h crml_gpu.cpp charge_gpu_memory.h $(OCL) -o $@ -c crml_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu @@ -129,16 +164,25 @@ $(OBJ_DIR)/lj96_cut_gpu_cl.h: lj96_cut_gpu_kernel.cu $(OBJ_DIR)/lj96_cut_gpu_memory.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu_memory.cpp $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj96_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o $(OCL) -o $@ -c lj96_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp +$(OBJ_DIR)/lj96_cut_gpu.o: $(ALL_H) lj96_cut_gpu_memory.h lj96_cut_gpu.cpp atomic_gpu_memory.h $(OCL) -o $@ -c lj96_cut_gpu.cpp -I$(OBJ_DIR) +$(OBJ_DIR)/lj_expand_gpu_cl.h: lj_expand_gpu_kernel.cu + $(BSH) ./geryon/file_to_cstr.sh lj_expand_gpu_kernel.cu $(OBJ_DIR)/lj_expand_gpu_cl.h; + +$(OBJ_DIR)/lj_expand_gpu_memory.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu_memory.cpp $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/lj_expand_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o + $(OCL) -o $@ -c lj_expand_gpu_memory.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lj_expand_gpu.o: $(ALL_H) lj_expand_gpu_memory.h lj_expand_gpu.cpp atomic_gpu_memory.h + 
$(OCL) -o $@ -c lj_expand_gpu.cpp -I$(OBJ_DIR) + $(OBJ_DIR)/cmm_cut_gpu_cl.h: cmm_cut_gpu_kernel.cu $(BSH) ./geryon/file_to_cstr.sh cmm_cut_gpu_kernel.cu $(OBJ_DIR)/cmm_cut_gpu_cl.h; $(OBJ_DIR)/cmm_cut_gpu_memory.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu_memory.cpp $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmm_cut_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o $(OCL) -o $@ -c cmm_cut_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp +$(OBJ_DIR)/cmm_cut_gpu.o: $(ALL_H) cmm_cut_gpu_memory.h cmm_cut_gpu.cpp atomic_gpu_memory.h $(OCL) -o $@ -c cmm_cut_gpu.cpp -I$(OBJ_DIR) $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu @@ -147,7 +191,7 @@ $(OBJ_DIR)/cmmc_long_gpu_cl.h: cmmc_long_gpu_kernel.cu $(OBJ_DIR)/cmmc_long_gpu_memory.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu_memory.cpp $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/pair_gpu_nbor_cl.h $(OBJ_DIR)/cmmc_long_gpu_cl.h $(OBJ_DIR)/atomic_gpu_memory.o $(OCL) -o $@ -c cmmc_long_gpu_memory.cpp -I$(OBJ_DIR) -$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp +$(OBJ_DIR)/cmmc_long_gpu.o: $(ALL_H) cmmc_long_gpu_memory.h cmmc_long_gpu.cpp charge_gpu_memory.h $(OCL) -o $@ -c cmmc_long_gpu.cpp -I$(OBJ_DIR) $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp diff --git a/lib/gpu/README b/lib/gpu/README index 567d81886b..a60d43064a 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -14,6 +14,7 @@ /* ---------------------------------------------------------------------- Contributing authors: Mike Brown (ORNL), brownw@ornl.gov Peng Wang (Nvidia), penwang@nvidia.com + Inderaj Bains (NVIDIA), ibains@nvidia.com Paul Crozier (SNL), pscrozi@sandia.gov ------------------------------------------------------------------------- */ diff --git a/lib/gpu/atomic_gpu_memory.cpp b/lib/gpu/atomic_gpu_memory.cpp index e1cc48048b..531ea4000d 100644 --- a/lib/gpu/atomic_gpu_memory.cpp +++ b/lib/gpu/atomic_gpu_memory.cpp @@ -23,23 +23,28 
@@ extern PairGPUDevice pair_gpu_device; template AtomicGPUMemoryT::AtomicGPUMemory() : _compiled(false), _max_bytes(0) { device=&pair_gpu_device; + ans=new PairGPUAns(); + nbor=new PairGPUNbor(); } template AtomicGPUMemoryT::~AtomicGPUMemory() { + delete ans; + delete nbor; } template int AtomicGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const { - return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors); + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); } template -bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, - const int max_nbors, const int maxspecial, - const double cell_size, - const double gpu_split, FILE *_screen, - const char *pair_program) { +int AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen, + const char *pair_program) { nbor_time_avail=false; screen=_screen; @@ -48,24 +53,30 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, gpu_nbor=true; int _gpu_host=0; - int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split); + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); if (host_nlocal>0) _gpu_host=1; - if (!device->init(false,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor, - _gpu_host,max_nbors,cell_size,false)) - return false; + _threads_per_atom=device->threads_per_atom(); + if (_threads_per_atom>1 && gpu_nbor==false) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor, + maxspecial,_gpu_host,max_nbors,cell_size,false); + if (success!=0) + return success; + ucl_device=device->gpu; atom=&device->atom; - nbor=&device->nbor; - _block_size=BLOCK_1D; - if (static_cast(_block_size)>ucl_device->group_size()) - _block_size=ucl_device->group_size(); + 
_block_size=device->pair_block_size(); compile_kernels(*ucl_device,pair_program); // Initialize host-device load balancer - hd_balancer.init(device,gpu_split); + hd_balancer.init(device,gpu_nbor,gpu_split); // Initialize timers for the selected GPU time_pair.init(*ucl_device); @@ -73,9 +84,14 @@ bool AtomicGPUMemoryT::init_atomic(const int nlocal, const int nall, pos_tex.bind_float(atom->dev_x,4); - _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); - return true; + return 0; +} + +template +void AtomicGPUMemoryT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); } template @@ -83,7 +99,10 @@ void AtomicGPUMemoryT::clear_atomic() { // Output any timing information acc_timers(); double avg_split=hd_balancer.all_avg_split(); - device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); if (_compiled) { k_pair_fast.clear(); @@ -107,8 +126,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, success=true; nbor_time_avail=true; - - int mn=nbor->max_nbor_loop(inum,numj); + int mn=nbor->max_nbor_loop(inum,numj,ilist); resize_atom(inum,nall,success); resize_local(inum,mn,success); if (!success) @@ -116,7 +134,7 @@ int * AtomicGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, nbor->get_host(inum,ilist,numj,firstneigh,block_size()); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -130,8 +148,8 @@ template inline void AtomicGPUMemoryT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, - int *host_type, double *boxlo, - double *boxhi, int *tag, + int *host_type, 
double *sublo, + double *subhi, int *tag, int **nspecial, int **special, bool &success) { nbor_time_avail=true; @@ -144,10 +162,10 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(inum, host_inum, nall, *atom, boxlo, boxhi, tag, + nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag, nspecial, special, success, mn); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; } @@ -156,24 +174,25 @@ inline void AtomicGPUMemoryT::build_nbor_list(const int inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void AtomicGPUMemoryT::compute(const int timestep, const int f_ago, - const int inum_full, const int nall, - double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success) { +void AtomicGPUMemoryT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success) { acc_timers(); if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); zero_timers(); return; } int ago=hd_balancer.ago_first(f_ago); - int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time, - nbor->gpu_nbor()); - atom->inum(inum); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); host_start=inum; if (ago==0) { @@ -187,7 +206,8 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago, atom->add_x_data(host_x,host_type); 
loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom,ilist); + ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -195,29 +215,32 @@ void AtomicGPUMemoryT::compute(const int timestep, const int f_ago, // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template -int * AtomicGPUMemoryT::compute(const int timestep, const int ago, - const int inum_full, const int nall, - double **host_x, int *host_type, double *boxlo, - double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success) { +int ** AtomicGPUMemoryT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success) { acc_timers(); if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); zero_timers(); return NULL; } - hd_balancer.balance(cpu_time,nbor->gpu_nbor()); - int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full); - atom->inum(inum); + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); host_start=inum; // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - boxlo, boxhi, tag, nspecial, special, success); + sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; hd_balancer.start_timer(); @@ -226,19 +249,21 @@ int * AtomicGPUMemoryT::compute(const int timestep, const int ago, hd_balancer.start_timer(); 
atom->add_x_data(host_x,host_type); } + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom); + ans->copy_answers(eflag,vflag,eatom,vatom); + device->add_ans_object(ans); hd_balancer.stop_timer(); - return device->nbor.host_nbor.begin(); + return nbor->host_jlist.begin()-host_start; } template double AtomicGPUMemoryT::host_memory_usage_atomic() const { - return device->atom.host_memory_usage()+ - device->nbor.host_memory_usage()+4*sizeof(numtyp)+ - sizeof(AtomicGPUMemory); + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(AtomicGPUMemory); } template diff --git a/lib/gpu/atomic_gpu_memory.h b/lib/gpu/atomic_gpu_memory.h index 81de41f3b7..238a4d9c1e 100644 --- a/lib/gpu/atomic_gpu_memory.h +++ b/lib/gpu/atomic_gpu_memory.h @@ -18,8 +18,6 @@ #ifndef ATOMIC_GPU_MEMORY_H #define ATOMIC_GPU_MEMORY_H -#define BLOCK_1D 64 - #include "pair_gpu_device.h" #include "pair_gpu_balance.h" #include "mpi.h" @@ -39,17 +37,28 @@ class AtomicGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init_atomic(const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, - const char *pair_program); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, + const char *pair_program); + + /// Estimate the overhead for GPU 
context changes and CPU driver + void estimate_gpu_overhead(); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { - if (atom->resize(inum, nall, success)) + if (atom->resize(nall, success)) pos_tex.bind_float(atom->dev_x,4); + ans->resize(inum,success); } /// Check if there is enough storage for neighbors and realloc if not @@ -85,13 +94,16 @@ class AtomicGPUMemory { /// Accumulate timers inline void acc_timers() { - if (nbor_time_avail) { - nbor->time_nbor.add_to_total(); - nbor->time_kernel.add_to_total(); - nbor_time_avail=false; + if (device->time_device()) { + if (nbor_time_avail) { + nbor->time_nbor.add_to_total(); + nbor->time_kernel.add_to_total(); + nbor_time_avail=false; + } + time_pair.add_to_total(); + atom->acc_timers(); + ans->acc_timers(); } - time_pair.add_to_total(); - atom->acc_timers(); } /// Zero timers @@ -99,6 +111,7 @@ class AtomicGPUMemory { nbor_time_avail=false; time_pair.zero(); atom->zero_timers(); + ans->zero_timers(); } /// Copy neighbor list from host @@ -108,24 +121,32 @@ class AtomicGPUMemory { /// Build neighbor list on device void build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success); /// Pair loop with host neighboring - void compute(const int timestep, const int f_ago, const int inum_full, + void compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); /// Pair loop with device neighboring - int * compute(const int timestep, const int ago, const int inum_full, - const int nall, double 
**host_x, int *host_type, double *boxlo, - double *boxhi, int *tag, int **nspecial, + int * compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); + /// Pair loop with device neighboring + int ** compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **numj, const double cpu_time, bool &success); + // -------------------------- DEVICE DATA ------------------------- /// Device Properties and Atom and Neighbor storage @@ -148,6 +169,9 @@ class AtomicGPUMemory { /// Atom Data PairGPUAtom *atom; + // ------------------------ FORCE/ENERGY DATA ----------------------- + + PairGPUAns *ans; // --------------------------- NBOR DATA ---------------------------- @@ -167,8 +191,10 @@ class AtomicGPUMemory { protected: bool _compiled; - int _block_size; + int _block_size, _threads_per_atom; double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const char *pair_string); diff --git a/lib/gpu/charge_gpu_memory.cpp b/lib/gpu/charge_gpu_memory.cpp index ce43fdfda1..412596f5f2 100644 --- a/lib/gpu/charge_gpu_memory.cpp +++ b/lib/gpu/charge_gpu_memory.cpp @@ -23,23 +23,28 @@ extern PairGPUDevice pair_gpu_device; template ChargeGPUMemoryT::ChargeGPUMemory() : _compiled(false), _max_bytes(0) { device=&pair_gpu_device; + ans=new PairGPUAns(); + nbor=new PairGPUNbor(); } template ChargeGPUMemoryT::~ChargeGPUMemory() { + delete ans; + delete nbor; } template int ChargeGPUMemoryT::bytes_per_atom_atomic(const int max_nbors) const { - return 
device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors); + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); } template -bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, - const int max_nbors, const int maxspecial, - const double cell_size, - const double gpu_split, FILE *_screen, - const char *pair_program) { +int ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, + const double gpu_split, FILE *_screen, + const char *pair_program) { nbor_time_avail=false; screen=_screen; @@ -48,24 +53,31 @@ bool ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, gpu_nbor=true; int _gpu_host=0; - int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split); + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); if (host_nlocal>0) _gpu_host=1; - if (!device->init(true,false,nlocal,host_nlocal,nall,maxspecial,gpu_nbor, - _gpu_host,max_nbors,cell_size,false)) - return false; + _threads_per_atom=device->threads_per_charge(); + if (_threads_per_atom>1 && gpu_nbor==false) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor, + maxspecial,_gpu_host,max_nbors,cell_size,false); + if (success!=0) + return success; + ucl_device=device->gpu; atom=&device->atom; - nbor=&device->nbor; - _block_size=BLOCK_1D; - if (static_cast(_block_size)>ucl_device->group_size()) - _block_size=ucl_device->group_size(); + _block_size=device->pair_block_size(); + _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program); // Initialize host-device load balancer - hd_balancer.init(device,gpu_split); + hd_balancer.init(device,gpu_nbor,gpu_split); // Initialize timers for the selected GPU time_pair.init(*ucl_device); @@ -74,9 +86,14 @@ bool 
ChargeGPUMemoryT::init_atomic(const int nlocal, const int nall, pos_tex.bind_float(atom->dev_x,4); q_tex.bind_float(atom->dev_q,1); - _max_an_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); - return true; + return success; +} + +template +void ChargeGPUMemoryT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); } template @@ -84,7 +101,10 @@ void ChargeGPUMemoryT::clear_atomic() { // Output any timing information acc_timers(); double avg_split=hd_balancer.all_avg_split(); - device->output_times(time_pair,avg_split,_max_bytes+_max_an_bytes,screen); + _gpu_overhead*=hd_balancer.timestep(); + _driver_overhead*=hd_balancer.timestep(); + device->output_times(time_pair,*ans,*nbor,avg_split,_max_bytes+_max_an_bytes, + _gpu_overhead,_driver_overhead,_threads_per_atom,screen); if (_compiled) { k_pair_fast.clear(); @@ -109,7 +129,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, nbor_time_avail=true; - int mn=nbor->max_nbor_loop(inum,numj); + int mn=nbor->max_nbor_loop(inum,numj,ilist); resize_atom(inum,nall,success); resize_local(inum,mn,success); if (!success) @@ -117,7 +137,7 @@ int * ChargeGPUMemoryT::reset_nbors(const int nall, const int inum, int *ilist, nbor->get_host(inum,ilist,numj,firstneigh,block_size()); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; @@ -131,8 +151,8 @@ template inline void ChargeGPUMemoryT::build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, - int *host_type, double *boxlo, - double *boxhi, int *tag, + int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, bool &success) { nbor_time_avail=true; @@ -145,10 +165,10 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(inum, 
host_inum, nall, *atom, boxlo, boxhi, tag, + nbor->build_nbor_list(inum, host_inum, nall, *atom, sublo, subhi, tag, nspecial, special, success, mn); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) _max_an_bytes=bytes; } @@ -157,24 +177,26 @@ inline void ChargeGPUMemoryT::build_nbor_list(const int inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void ChargeGPUMemoryT::compute(const int timestep, const int f_ago, - const int inum_full, const int nall, - double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, double *host_q) { +void ChargeGPUMemoryT::compute(const int f_ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { acc_timers(); if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); zero_timers(); return; } int ago=hd_balancer.ago_first(f_ago); - int inum=hd_balancer.balance(timestep,ago,inum_full,cpu_time, - nbor->gpu_nbor()); - atom->inum(inum); + int inum=hd_balancer.balance(ago,inum_full,cpu_time); + ans->inum(inum); host_start=inum; if (ago==0) { @@ -187,10 +209,14 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago, atom->cast_q_data(host_q); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); - atom->add_other_data(); + atom->add_q_data(); + + device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, + boxlo, 
prd); loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom,ilist); + ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -198,30 +224,33 @@ void ChargeGPUMemoryT::compute(const int timestep, const int f_ago, // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template -int * ChargeGPUMemoryT::compute(const int timestep, const int ago, - const int inum_full, const int nall, - double **host_x, int *host_type, double *boxlo, - double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, +int** ChargeGPUMemoryT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, bool &success, - double *host_q) { + double *host_q, double *boxlo, double *prd) { acc_timers(); if (inum_full==0) { + host_start=0; + // Make sure textures are correct if realloc by a different hybrid style + resize_atom(0,nall,success); zero_timers(); return NULL; } - hd_balancer.balance(cpu_time,nbor->gpu_nbor()); - int inum=hd_balancer.get_gpu_count(timestep,ago,inum_full); - atom->inum(inum); + hd_balancer.balance(cpu_time); + int inum=hd_balancer.get_gpu_count(ago,inum_full); + ans->inum(inum); host_start=inum; // Build neighbor list on GPU if necessary if (ago==0) { build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, - boxlo, boxhi, tag, nspecial, special, success); + sublo, subhi, tag, nspecial, special, success); if (!success) return NULL; atom->cast_q_data(host_q); @@ -232,20 +261,25 @@ int * ChargeGPUMemoryT::compute(const int timestep, const int ago, hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); } - atom->add_other_data(); + 
atom->add_q_data(); + *ilist=nbor->host_ilist.begin(); + *jnum=nbor->host_acc.begin(); + + device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, + boxlo, prd); loop(eflag,vflag); - atom->copy_answers(eflag,vflag,eatom,vatom); + ans->copy_answers(eflag,vflag,eatom,vatom); + device->add_ans_object(ans); hd_balancer.stop_timer(); - return device->nbor.host_nbor.begin(); + return nbor->host_jlist.begin()-host_start; } template double ChargeGPUMemoryT::host_memory_usage_atomic() const { - return device->atom.host_memory_usage()+ - device->nbor.host_memory_usage()+4*sizeof(numtyp)+ - sizeof(ChargeGPUMemory); + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(ChargeGPUMemory); } template diff --git a/lib/gpu/charge_gpu_memory.h b/lib/gpu/charge_gpu_memory.h index d18857e4d6..768f0e0c08 100644 --- a/lib/gpu/charge_gpu_memory.h +++ b/lib/gpu/charge_gpu_memory.h @@ -18,8 +18,6 @@ #ifndef CHARGE_GPU_MEMORY_H #define CHARGE_GPU_MEMORY_H -#define BLOCK_1D 64 - #include "pair_gpu_device.h" #include "pair_gpu_balance.h" #include "mpi.h" @@ -39,19 +37,30 @@ class ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init_atomic(const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, - const char *pair_program); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init_atomic(const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE 
*screen, + const char *pair_program); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { - if (atom->resize(inum, nall, success)) { + if (atom->resize(nall, success)) { pos_tex.bind_float(atom->dev_x,4); q_tex.bind_float(atom->dev_q,1); } + ans->resize(inum,success); } /// Check if there is enough storage for neighbors and realloc if not @@ -87,13 +96,16 @@ class ChargeGPUMemory { /// Accumulate timers inline void acc_timers() { - if (nbor_time_avail) { - nbor->time_nbor.add_to_total(); - nbor->time_kernel.add_to_total(); - nbor_time_avail=false; + if (device->time_device()) { + if (nbor_time_avail) { + nbor->time_nbor.add_to_total(); + nbor->time_kernel.add_to_total(); + nbor_time_avail=false; + } + time_pair.add_to_total(); + atom->acc_timers(); + ans->acc_timers(); } - time_pair.add_to_total(); - atom->acc_timers(); } /// Zero timers @@ -101,6 +113,7 @@ class ChargeGPUMemory { nbor_time_avail=false; time_pair.zero(); atom->zero_timers(); + ans->zero_timers(); } /// Copy neighbor list from host @@ -110,24 +123,25 @@ class ChargeGPUMemory { /// Build neighbor list on device void build_nbor_list(const int inum, const int host_inum, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success); /// Pair loop with host neighboring - void compute(const int timestep, const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success, - double *charge); + void compute(const int f_ago, const int 
inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *charge, + const int nlocal, double *boxlo, double *prd); /// Pair loop with device neighboring - int * compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, double *boxlo, - double *boxhi, int *tag, int **nspecial, + int** compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *charge); + int **ilist, int **numj, const double cpu_time, bool &success, + double *charge, double *boxlo, double *prd); // -------------------------- DEVICE DATA ------------------------- @@ -152,6 +166,10 @@ class ChargeGPUMemory { PairGPUAtom *atom; + // ------------------------ FORCE/ENERGY DATA ----------------------- + + PairGPUAns *ans; + // --------------------------- NBOR DATA ---------------------------- /// Neighbor data @@ -171,8 +189,10 @@ class ChargeGPUMemory { protected: bool _compiled; - int _block_size; + int _block_size, _block_bio_size, _threads_per_atom; double _max_bytes, _max_an_bytes; + double _gpu_overhead, _driver_overhead; + UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const char *pair_string); diff --git a/lib/gpu/cmm_cut_gpu.cpp b/lib/gpu/cmm_cut_gpu.cpp index 53976ff7e8..7be958615a 100644 --- a/lib/gpu/cmm_cut_gpu.cpp +++ b/lib/gpu/cmm_cut_gpu.cpp @@ -28,12 +28,12 @@ static CMM_GPU_Memory CMMMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // 
--------------------------------------------------------------------------- -bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen) { +int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen) { CMMMF.clear(); gpu_mode=CMMMF.device->gpu_mode(); double gpu_split=CMMMF.device->particle_split(); @@ -54,13 +54,11 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, fflush(screen); } - if (world_me==0) { - bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); CMMMF.device->world_barrier(); if (message) @@ -75,45 +73,45 @@ bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); + CMMMF.device->gpu_barrier(); if 
(message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + CMMMF.estimate_gpu_overhead(); + return init_ok; } void cmm_gpu_clear() { CMMMF.clear(); } -int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** cmm_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success) { - return CMMMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success); + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return CMMMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); } -void cmm_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success) { - CMMMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void cmm_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + CMMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } diff --git a/lib/gpu/cmm_cut_gpu_kernel.cu b/lib/gpu/cmm_cut_gpu_kernel.cu index 47504f621e..08cc31ed7f 100644 --- 
a/lib/gpu/cmm_cut_gpu_kernel.cu +++ b/lib/gpu/cmm_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef CMM_GPU_KERNEL #define CMM_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; #ifdef _DOUBLE_DOUBLE @@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -82,40 +82,56 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if 
(offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -183,49 +238,64 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in,__global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + __global numtyp* sp_lj_in,__global int *dev_nbor, + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + 
+ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/cmm_cut_gpu_memory.cpp b/lib/gpu/cmm_cut_gpu_memory.cpp index e5a83e5872..8a5949c9e7 100644 --- a/lib/gpu/cmm_cut_gpu_memory.cpp +++ b/lib/gpu/cmm_cut_gpu_memory.cpp @@ -42,22 +42,26 @@ int CMM_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,cmm_cut_gpu_kernel); +int CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,cmm_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int cmm_types=ntypes; shared_types=false; - if (cmm_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - cmm_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (cmm_types<=max_shared_types && this->_block_size>=max_shared_types) { + cmm_types=max_shared_types; shared_types=true; } _cmm_types=cmm_types; @@ -84,7 +88,7 @@ bool 
CMM_GPU_MemoryT::init(const int ntypes, double **host_cutsq, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -122,9 +126,10 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,16 +138,18 @@ void CMM_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_cmm_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/cmm_cut_gpu_memory.h b/lib/gpu/cmm_cut_gpu_memory.h index 8099d5b9c4..fff90e477d 100644 --- a/lib/gpu/cmm_cut_gpu_memory.h +++ b/lib/gpu/cmm_cut_gpu_memory.h @@ -29,13 +29,20 @@ class CMM_GPU_Memory : public AtomicGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param 
gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, int **host_cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, int **host_cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/cmmc_long_gpu.cpp b/lib/gpu/cmmc_long_gpu.cpp index a3fcf336c6..a6f3d090af 100644 --- a/lib/gpu/cmmc_long_gpu.cpp +++ b/lib/gpu/cmmc_long_gpu.cpp @@ -28,14 +28,14 @@ static CMML_GPU_Memory CMMLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const 
double g_ewald) { +int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { CMMLMF.clear(); gpu_mode=CMMLMF.device->gpu_mode(); double gpu_split=CMMLMF.device->particle_split(); @@ -56,15 +56,12 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, fflush(screen); } - if (world_me==0) { - bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, - host_lj3, host_lj4, offset, special_lj, inum, - nall, 300, maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e,g_ewald); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); CMMLMF.device->world_barrier(); if (message) @@ -79,48 +76,51 @@ bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, - host_lj3, host_lj4, offset, special_lj, inum, - nall, 300, maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e, g_ewald); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, + host_cut_ljsq, host_cut_coulsq, host_special_coul, + qqrd2e, 
g_ewald); CMMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + CMMLMF.estimate_gpu_overhead(); + return init_ok; } void cmml_gpu_clear() { CMMLMF.clear(); } -int * cmml_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** cmml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q) { - return CMMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success, host_q); + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CMMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q,boxlo,prd); } -void cmml_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q) { - CMMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void cmml_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + 
CMMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); + host_q,nlocal,boxlo,prd); } double cmml_gpu_bytes() { diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu index 4a19b5fe03..5153cb5016 100644 --- a/lib/gpu/cmmc_long_gpu_kernel.cu +++ b/lib/gpu/cmmc_long_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef CMML_GPU_KERNEL #define CMML_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -54,7 +52,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_ , + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 
*lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -234,51 +291,67 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local 
numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/cmmc_long_gpu_memory.cpp b/lib/gpu/cmmc_long_gpu_memory.cpp index 9a63bc5628..e2f99fceca 100644 --- a/lib/gpu/cmmc_long_gpu_memory.cpp +++ b/lib/gpu/cmmc_long_gpu_memory.cpp @@ -43,26 +43,30 @@ int CMML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, - int **host_cg_type, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double **host_cut_ljsq, - const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,cmmc_long_gpu_kernel); +int CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, + int **host_cg_type, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int 
max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, + const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,cmmc_long_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -95,7 +99,7 @@ bool CMML_GPU_MemoryT::init(const int ntypes, double **host_cutsq, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -133,9 +137,10 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -144,19 +149,21 @@ void CMML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, - &_qqrd2e, &_g_ewald); + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { 
this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &_cut_coulsq, &_qqrd2e, &_g_ewald); + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/cmmc_long_gpu_memory.h b/lib/gpu/cmmc_long_gpu_memory.h index 8192c78249..45090368a5 100644 --- a/lib/gpu/cmmc_long_gpu_memory.h +++ b/lib/gpu/cmmc_long_gpu_memory.h @@ -29,15 +29,22 @@ class CMML_GPU_Memory : public ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, int ** cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, double **host_cut_ljsq, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, int ** cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const 
int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/crml_gpu.cpp b/lib/gpu/crml_gpu.cpp index 7458300907..1e59562ed5 100644 --- a/lib/gpu/crml_gpu.cpp +++ b/lib/gpu/crml_gpu.cpp @@ -28,16 +28,16 @@ static CRML_GPU_Memory CRMLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, - double **sigma, const bool mix_arithmetic) { +int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald, const double cut_lj_innersq, + const double denom_lj, double **epsilon, + double **sigma, const bool mix_arithmetic) { CRMLMF.clear(); gpu_mode=CRMLMF.device->gpu_mode(); double gpu_split=CRMLMF.device->particle_split(); @@ -58,16 +58,13 @@ bool 
crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, fflush(screen); } - if (world_me==0) { - bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, - host_cut_ljsq, host_cut_coulsq, host_special_coul, - qqrd2e, g_ewald, cut_lj_innersq, denom_lj, - epsilon,sigma,mix_arithmetic); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, cell_size, + gpu_split, screen, host_cut_ljsq, host_cut_coulsq, + host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, + epsilon,sigma,mix_arithmetic); CRMLMF.device->world_barrier(); if (message) @@ -82,50 +79,54 @@ bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e, g_ewald, - cut_lj_innersq, denom_lj, epsilon, sigma, - mix_arithmetic); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, + host_cut_ljsq, host_cut_coulsq, host_special_coul, + qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon, + sigma, mix_arithmetic); + CRMLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + CRMLMF.estimate_gpu_overhead(); + return init_ok; } void crml_gpu_clear() { CRMLMF.clear(); } -int * crml_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** 
crml_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q) { - return CRMLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success, host_q); + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return CRMLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); } -void crml_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q) { - CRMLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, - firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); +void crml_gpu_compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd) { + CRMLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, + eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); } double crml_gpu_bytes() { diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu index 6ba6eaedca..63ce924581 
100644 --- a/lib/gpu/crml_gpu_kernel.cu +++ b/lib/gpu/crml_gpu_kernel.cu @@ -54,7 +54,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -90,6 +90,7 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_BIO_PAIR 64 #endif @@ -98,18 +99,22 @@ __inline float fetch_q(const int& i, const float *q) __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, - const int lj_types, - __global numtyp *sp_lj_in, __global int *dev_nbor, + const int lj_types, __global numtyp *sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_, const numtyp cut_coulsq, const numtyp qqrd2e, const numtyp g_ewald, const numtyp denom_lj, const numtyp cut_bothsq, - const numtyp cut_ljsq, const numtyp cut_lj_innersq) { + const numtyp cut_ljsq, const numtyp cut_lj_innersq, + const int t_per_atom) { + + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -120,29 +125,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + 
energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -240,50 +298,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in, __global numtyp* sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_, const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald, - const numtyp denom_lj, const numtyp cut_bothsq, - const numtyp cut_ljsq, - const numtyp cut_lj_innersq) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const numtyp denom_lj, + const numtyp cut_bothsq, const numtyp cut_ljsq, + const numtyp cut_lj_innersq, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - ljd[ii]=ljd_in[ii]; - ljd[ii+64]=ljd_in[ii+64]; - - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + ljd[tid]=ljd_in[tid]; + if (tid+BLOCK_BIO_PAIR1) { + __local acctyp red_acc[6][BLOCK_BIO_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + 
red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/crml_gpu_memory.cpp b/lib/gpu/crml_gpu_memory.cpp index e877503e87..6661f67585 100644 --- a/lib/gpu/crml_gpu_memory.cpp +++ b/lib/gpu/crml_gpu_memory.cpp @@ -43,7 +43,7 @@ int CRML_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool CRML_GPU_MemoryT::init(const int ntypes, +int CRML_GPU_MemoryT::init(const int ntypes, double host_cut_bothsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, @@ -56,20 +56,24 @@ bool CRML_GPU_MemoryT::init(const int ntypes, const double g_ewald, const double cut_lj_innersq, const double denom_lj, double **epsilon, double **sigma, const bool mix_arithmetic) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,crml_gpu_kernel); + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,crml_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (this->_block_size>=64 && mix_arithmetic) + if (this->_block_bio_size>=64 && mix_arithmetic) shared_types=true; _lj_types=lj_types; // Allocate a host write buffer for data initialization int h_size=lj_types*lj_types; - if (h_sizedevice->max_bio_shared_types(); + if (h_size 
host_write(h_size*32,*(this->ucl_device), UCL_WRITE_OPTIMIZED); for (int i=0; iatom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2, host_lj3,host_lj4); - ljd.alloc(MAX_BIO_SHARED_TYPES,*(this->ucl_device),UCL_READ_ONLY); + ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY); this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma); sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY); @@ -99,7 +103,7 @@ bool CRML_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -125,7 +129,7 @@ double CRML_GPU_MemoryT::host_memory_usage() const { template void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { // Compute the block size and grid size to keep all cores busy - const int BX=this->block_size(); + const int BX=this->_block_bio_size; int eflag, vflag; if (_eflag) eflag=1; @@ -137,9 +141,10 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -147,21 +152,24 @@ void CRML_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.set_size(GX,BX); this->k_pair_fast.run(&this->atom->dev_x.begin(), &ljd.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, &_cut_bothsq, - &_cut_ljsq, &_cut_lj_innersq); + &_cut_ljsq, &_cut_lj_innersq, + &this->_threads_per_atom); } else { 
this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, &_qqrd2e, &_g_ewald, &_denom_lj, - &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq); + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/crml_gpu_memory.h b/lib/gpu/crml_gpu_memory.h index 5520cd3a17..a474d5982d 100644 --- a/lib/gpu/crml_gpu_memory.h +++ b/lib/gpu/crml_gpu_memory.h @@ -29,17 +29,24 @@ class CRML_GPU_Memory : public ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double host_cut_bothsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, double host_cut_ljsq, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald, - const double cut_lj_innersq, const double denom_lj, - double **epsilon, double **sigma, const bool mix_arithmetic); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double host_cut_bothsq, + double **host_lj1, double 
**host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald, + const double cut_lj_innersq, const double denom_lj, + double **epsilon, double **sigma, const bool mix_arithmetic); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/gb_gpu.cpp b/lib/gpu/gb_gpu.cpp index 5ca88fd70f..70eb4d9344 100644 --- a/lib/gpu/gb_gpu.cpp +++ b/lib/gpu/gb_gpu.cpp @@ -49,14 +49,14 @@ void gb_gpu_pack_nbors(GBMT &gbm, const int GX, const int BX, const int start, // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool gb_gpu_init(const int ntypes, const double gamma, - const double upsilon, const double mu, double **shape, - double **well, double **cutsq, double **sigma, - double **epsilon, double *host_lshape, int **form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen) { +int gb_gpu_init(const int ntypes, const double gamma, + const double upsilon, const double mu, double **shape, + double **well, double **cutsq, double **sigma, + double **epsilon, double *host_lshape, int **form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const double cell_size, int &gpu_mode, FILE *screen) { GBMF.clear(); gpu_mode=GBMF.device->gpu_mode(); double 
gpu_split=GBMF.device->particle_split(); @@ -77,14 +77,12 @@ bool gb_gpu_init(const int ntypes, const double gamma, fflush(screen); } - if (world_me==0) { - bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, - sigma, epsilon, host_lshape, form, host_lj1, - host_lj2, host_lj3, host_lj4, offset, special_lj, - inum, nall, max_nbors, cell_size, gpu_split, screen); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, + sigma, epsilon, host_lshape, form, host_lj1, + host_lj2, host_lj3, host_lj4, offset, special_lj, + inum, nall, max_nbors, cell_size, gpu_split, screen); GBMF.device->world_barrier(); if (message) @@ -99,22 +97,22 @@ bool gb_gpu_init(const int ntypes, const double gamma, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, - sigma, epsilon, host_lshape, form, host_lj1, - host_lj2, host_lj3, host_lj4, offset, special_lj, - inum, nall, max_nbors, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=GBMF.init(ntypes, gamma, upsilon, mu, shape, well, cutsq, sigma, + epsilon, host_lshape, form, host_lj1, host_lj2, + host_lj3, host_lj4, offset, special_lj, inum, nall, + max_nbors, cell_size, gpu_split, screen); + GBMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + GBMF.estimate_gpu_overhead(); + return init_ok; } // --------------------------------------------------------------------------- @@ -131,8 +129,8 @@ template inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum, const int host_inum, const int nall, double **host_x, double **host_quat, - int *host_type, double *boxlo, - double *boxhi, bool &success) { + int *host_type, double *sublo, + double *subhi, bool &success) { gbm.nbor_time_avail=true; 
success=true; @@ -144,7 +142,7 @@ inline void _gb_gpu_build_nbor_list(gbmtyp &gbm, const int inum, gbm.atom->cast_copy_x(host_x,host_type); int mn; gbm.nbor->build_nbor_list(inum, host_inum, nall, *gbm.atom, - boxlo, boxhi, NULL, NULL, NULL, success, mn); + sublo, subhi, NULL, NULL, NULL, success, mn); gbm.nbor->copy_unpacked(inum,mn); gbm.last_ellipse=inum; gbm.max_last_ellipse=inum; @@ -163,7 +161,7 @@ void _gb_gpu_reset_nbors(gbmtyp &gbm, const int nall, gbm.nbor_time_avail=true; - int mn=gbm.nbor->max_nbor_loop(inum,numj); + int mn=gbm.nbor->max_nbor_loop(inum,numj,ilist); gbm.resize_atom(inum,nall,success); gbm.resize_local(inum,0,mn,osize,success); if (!success) @@ -216,9 +214,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(gbm.atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(gbm.ans->inum())/ + (BX/gbm._threads_per_atom))); int stride=gbm.nbor->nbor_pitch(); - int ainum=gbm.atom->inum(); + int ainum=gbm.ans->inum(); int anall=gbm.atom->nall(); if (gbm.multiple_forms) { @@ -226,7 +225,7 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { if (gbm.last_ellipse>0) { // ------------ ELLIPSE_ELLIPSE and ELLIPSE_SPHERE --------------- GX=static_cast(ceil(static_cast(gbm.last_ellipse)/ - static_cast(BX))); + (BX/gbm._threads_per_atom))); gb_gpu_pack_nbors(gbm,GX,BX, 0, gbm.last_ellipse,ELLIPSE_SPHERE, ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); @@ -237,11 +236,12 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.atom->dev_quat.begin(), &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), - &stride, &gbm.atom->dev_ans.begin(),&ainum,&gbm.atom->dev_engv.begin(), - &gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall); + &stride, &gbm.ans->dev_ans.begin(),&ainum,&gbm.ans->dev_engv.begin(), + 
&gbm.dev_error.begin(), &eflag, &vflag, &gbm.last_ellipse, &anall, + &gbm._threads_per_atom); gbm.time_gayberne.stop(); - if (gbm.last_ellipse==gbm.atom->inum()) { + if (gbm.last_ellipse==gbm.ans->inum()) { gbm.time_kernel2.start(); gbm.time_kernel2.stop(); gbm.time_gayberne2.start(); @@ -254,9 +254,10 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // ------------ SPHERE_ELLIPSE --------------- gbm.time_kernel2.start(); - GX=static_cast(ceil(static_cast(gbm.atom->inum()- - gbm.last_ellipse)/BX)); - gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.atom->inum(), + GX=static_cast(ceil(static_cast(gbm.ans->inum()- + gbm.last_ellipse)/ + (BX/gbm._threads_per_atom))); + gb_gpu_pack_nbors(gbm,GX,BX,gbm.last_ellipse,gbm.ans->inum(), SPHERE_ELLIPSE,SPHERE_ELLIPSE); gbm.time_kernel2.stop(); @@ -266,13 +267,14 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), - &gbm.nbor->dev_nbor.begin(), &stride, &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, - &vflag, &gbm.last_ellipse, &ainum, &anall); + &gbm.nbor->dev_nbor.begin(), &stride, &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), &eflag, + &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); gbm.time_gayberne2.stop(); } else { - gbm.atom->dev_ans.zero(); - gbm.atom->dev_engv.zero(); + gbm.ans->dev_ans.zero(); + gbm.ans->dev_engv.zero(); gbm.time_kernel.stop(); gbm.time_gayberne.start(); gbm.time_gayberne.stop(); @@ -284,29 +286,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // ------------ LJ --------------- gbm.time_pair.start(); - if (gbm.last_ellipseinum()) { + if (gbm.last_ellipseinum()) { if (gbm.shared_types) { GBMF.k_lj_fast.set_size(GX,BX); GBMF.k_lj_fast.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), 
&gbm.lj3.begin(), &gbm.gamma_upsilon_mu.begin(), &stride, &gbm.nbor->dev_packed.begin(), - &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); + &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); } else { GBMF.k_lj.set_size(GX,BX); GBMF.k_lj.run(&gbm.atom->dev_x.begin(), &gbm.lj1.begin(), &gbm.lj3.begin(), &gbm._lj_types, &gbm.gamma_upsilon_mu.begin(), &stride, - &gbm.nbor->dev_packed.begin(), &gbm.atom->dev_ans.begin(), - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall); + &gbm.nbor->dev_packed.begin(), &gbm.ans->dev_ans.begin(), + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &gbm.last_ellipse, &ainum, &anall, + &gbm._threads_per_atom); } } gbm.time_pair.stop(); } else { gbm.time_kernel.start(); - gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.atom->inum(),SPHERE_SPHERE, + gb_gpu_pack_nbors(gbm, GX, BX, 0, gbm.ans->inum(),SPHERE_SPHERE, ELLIPSE_ELLIPSE); gbm.time_kernel.stop(); gbm.time_gayberne.start(); @@ -315,9 +319,9 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { &gbm.shape.begin(), &gbm.well.begin(), &gbm.gamma_upsilon_mu.begin(), &gbm.sigma_epsilon.begin(), &gbm._lj_types, &gbm.lshape.begin(), &gbm.nbor->dev_nbor.begin(), - &stride, &gbm.atom->dev_ans.begin(), &ainum, - &gbm.atom->dev_engv.begin(), &gbm.dev_error.begin(), - &eflag, &vflag, &ainum, &anall); + &stride, &gbm.ans->dev_ans.begin(), &ainum, + &gbm.ans->dev_engv.begin(), &gbm.dev_error.begin(), + &eflag, &vflag, &ainum, &anall, &gbm._threads_per_atom); gbm.time_gayberne.stop(); } } @@ -326,30 +330,31 @@ void _gb_gpu_gayberne(GBMT &gbm, const bool _eflag, const bool _vflag) { // Reneighbor on GPU if necessary and then compute forces, torques, energies // 
--------------------------------------------------------------------------- template -inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago, - const int inum_full, const int nall, - double **host_x, int *host_type, - double *boxlo, double *boxhi, const bool eflag, - const bool vflag, const bool eatom, +inline int** _gb_gpu_compute_n(gbmtyp &gbm, const int ago, + const int inum_full, const int nall, + double **host_x, int *host_type, + double *sublo, double *subhi, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, - double **host_quat) { + int **ilist, int **jnum, const double cpu_time, + bool &success, double **host_quat) { gbm.acc_timers(); if (inum_full==0) { + host_start=0; gbm.zero_timers(); return NULL; } - gbm.hd_balancer.balance(cpu_time,gbm.nbor->gpu_nbor()); - int inum=gbm.hd_balancer.get_gpu_count(timestep,ago,inum_full); - gbm.atom->inum(inum); + gbm.hd_balancer.balance(cpu_time); + int inum=gbm.hd_balancer.get_gpu_count(ago,inum_full); + gbm.ans->inum(inum); gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); host_start=inum; // Build neighbor list on GPU if necessary if (ago==0) { _gb_gpu_build_nbor_list(gbm, inum, inum_full-inum, nall, host_x, - host_quat, host_type, boxlo, boxhi, success); + host_quat, host_type, sublo, subhi, success); if (!success) return NULL; gbm.atom->cast_quat_data(host_quat[0]); @@ -361,47 +366,49 @@ inline int * _gb_gpu_compute_n(gbmtyp &gbm, const int timestep, const int ago, gbm.atom->add_x_data(host_x,host_type); } - gbm.atom->add_other_data(); + gbm.atom->add_quat_data(); + *ilist=gbm.nbor->host_ilist.begin(); + *jnum=gbm.nbor->host_acc.begin(); _gb_gpu_gayberne(gbm,eflag,vflag); - gbm.atom->copy_answers(eflag,vflag,eatom,vatom); + gbm.ans->copy_answers(eflag,vflag,eatom,vatom); + gbm.device->add_ans_object(gbm.ans); gbm.hd_balancer.stop_timer(); - return gbm.device->nbor.host_nbor.begin(); + return 
gbm.nbor->host_jlist.begin()-host_start; } -int * gb_gpu_compute_n(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success, - double **host_quat) { - return _gb_gpu_compute_n(GBMF, timestep, ago, inum_full, nall, host_x, - host_type, boxlo, boxhi, eflag, vflag, eatom, vatom, - host_start, cpu_time, success, host_quat); +int** gb_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double **host_quat) { + return _gb_gpu_compute_n(GBMF, ago, inum_full, nall, host_x, host_type, sublo, + subhi, eflag, vflag, eatom, vatom, host_start, ilist, + jnum, cpu_time, success, host_quat); } // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, torques,.. 
// --------------------------------------------------------------------------- template -inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago, - const int inum_full,const int nall,double **host_x, - int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - const double cpu_time, bool &success, - double **host_quat) { +inline int * _gb_gpu_compute(gbmtyp &gbm, const int f_ago, const int inum_full, + const int nall,double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double **host_quat) { gbm.acc_timers(); if (inum_full==0) { + host_start=0; gbm.zero_timers(); return NULL; } int ago=gbm.hd_balancer.ago_first(f_ago); - int inum=gbm.hd_balancer.balance(timestep,ago,inum_full,cpu_time, - gbm.nbor->gpu_nbor()); - gbm.atom->inum(inum); + int inum=gbm.hd_balancer.balance(ago,inum_full,cpu_time); + gbm.ans->inum(inum); gbm.last_ellipse=std::min(inum,gbm.max_last_ellipse); host_start=inum; @@ -421,21 +428,21 @@ inline int * _gb_gpu_compute(gbmtyp &gbm, const int timestep, const int f_ago, gbm.atom->cast_quat_data(host_quat[0]); gbm.hd_balancer.start_timer(); gbm.atom->add_x_data(host_x,host_type); - gbm.atom->add_other_data(); + gbm.atom->add_quat_data(); _gb_gpu_gayberne(gbm,eflag,vflag); - gbm.atom->copy_answers(eflag,vflag,eatom,vatom,list); + gbm.ans->copy_answers(eflag,vflag,eatom,vatom,list); + gbm.device->add_ans_object(gbm.ans); gbm.hd_balancer.stop_timer(); return list; } -int * gb_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double **host_quat) { 
- return _gb_gpu_compute(GBMF, timestep, ago, inum_full, nall, host_x, +int * gb_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_quat) { + return _gb_gpu_compute(GBMF, ago, inum_full, nall, host_x, host_type, ilist, numj, firstneigh, eflag, vflag, eatom, vatom, host_start, cpu_time, success, host_quat); diff --git a/lib/gpu/gb_gpu_extra.h b/lib/gpu/gb_gpu_extra.h index 6ac390437a..a341940c0a 100644 --- a/lib/gpu/gb_gpu_extra.h +++ b/lib/gpu/gb_gpu_extra.h @@ -18,7 +18,6 @@ #ifndef GB_GPU_EXTRA_H #define GB_GPU_EXTRA_H -#define MAX_SHARED_TYPES 8 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #ifdef _DOUBLE_DOUBLE @@ -47,7 +46,7 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" #else @@ -58,6 +57,8 @@ enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #define BLOCK_SIZE_X get_local_size(0) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define __inline inline +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif diff --git a/lib/gpu/gb_gpu_kernel.cu b/lib/gpu/gb_gpu_kernel.cu index b8d06ec6da..7bb320f5d0 100644 --- a/lib/gpu/gb_gpu_kernel.cu +++ b/lib/gpu/gb_gpu_kernel.cu @@ -97,17 +97,17 @@ __kernel void kernel_gayberne(__global numtyp4* x_,__global numtyp4 *q, __global acctyp4 *ans, const int astride, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag, const int inum, - const int nall) { + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int 
ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); - __syncthreads(); - - if (ii0) + energy+=u_r*temp2; + numtyp temp1 = -eta*u_r*factor_lj; + if (vflag>0) { + r12[0]*=-r; + r12[1]*=-r; + r12[2]*=-r; + numtyp ft=temp1*dchi[0]-temp2*dUr[0]; + f.x+=ft; + virial[0]+=r12[0]*ft; + ft=temp1*dchi[1]-temp2*dUr[1]; + f.y+=ft; + virial[1]+=r12[1]*ft; + virial[3]+=r12[0]*ft; + ft=temp1*dchi[2]-temp2*dUr[2]; + f.z+=ft; + virial[2]+=r12[2]*ft; + virial[4]+=r12[0]*ft; + virial[5]+=r12[1]*ft; + } else { + f.x+=temp1*dchi[0]-temp2*dUr[0]; + f.y+=temp1*dchi[1]-temp2*dUr[1]; + f.z+=temp1*dchi[2]-temp2*dUr[2]; + } + + // Torque on 1 + temp1 = -u_r*eta*factor_lj; + temp2 = -u_r*chi*factor_lj; + numtyp temp3 = -chi*eta*factor_lj; + tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; + tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; + tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; + + } // for nbor + } // if ii - // energy - - // compute u_r and dUr - numtyp uslj_rsq; - { - // Compute distance of closest approach - numtyp h12, sigma12; - sigma12 = gpu_dot3(r12,kappa); - sigma12 = rsqrt((numtyp)0.5*sigma12); - h12 = r-sigma12; + // Reduce answers + if (t_per_atom>1) { + __local acctyp red_acc[7][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=tor.x; + red_acc[4][tid]=tor.y; + red_acc[5][tid]=tor.z; - // -- kappa is now ok - kappa[0]*=r; - kappa[1]*=r; - kappa[2]*=r; - - int mtype=mul24(ntypes,itype)+jtype; - numtyp sigma = sig_eps[mtype].x; - numtyp epsilon = sig_eps[mtype].y; - numtyp varrho = sigma/(h12+gum[0]*sigma); - numtyp varrho6 = varrho*varrho*varrho; - varrho6*=varrho6; - numtyp varrho12 = varrho6*varrho6; - u_r = (numtyp)4.0*epsilon*(varrho12-varrho6); - - numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma; - temp1 = temp1*(numtyp)24.0*epsilon; - uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5; - numtyp temp2 = gpu_dot3(kappa,r12); - uslj_rsq = 
uslj_rsq*ir*ir; - - dUr[0] = temp1*r12[0]+uslj_rsq*(kappa[0]-temp2*r12[0]); - dUr[1] = temp1*r12[1]+uslj_rsq*(kappa[1]-temp2*r12[1]); - dUr[2] = temp1*r12[2]+uslj_rsq*(kappa[2]-temp2*r12[2]); - } - - // torque for particle 1 - { - numtyp tempv[3], tempv2[3]; - tempv[0] = -uslj_rsq*kappa[0]; - tempv[1] = -uslj_rsq*kappa[1]; - tempv[2] = -uslj_rsq*kappa[2]; - gpu_row_times3(kappa,g1,tempv2); - gpu_cross3(tempv,tempv2,tUr); - } + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; } } - - // Compute eta - { - eta = (numtyp)2.0*lshape[itype]*lshape[jtype]; - numtyp det_g12 = gpu_det3(g12); - eta = pow(eta/det_g12,gum[1]); - } - // Compute teta - numtyp temp[9], tempv[3], tempv2[3]; - compute_eta_torque(g12,a1,ishape,temp); - numtyp temp1 = -eta*gum[1]; + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + tor.x=red_acc[3][tid]; + tor.y=red_acc[4][tid]; + tor.z=red_acc[5][tid]; - tempv[0] = temp1*temp[0]; - tempv[1] = temp1*temp[1]; - tempv[2] = temp1*temp[2]; - gpu_cross3(a1,tempv,tempv2); - teta[0] = tempv2[0]; - teta[1] = tempv2[1]; - teta[2] = tempv2[2]; - - tempv[0] = temp1*temp[3]; - tempv[1] = temp1*temp[4]; - tempv[2] = temp1*temp[5]; - gpu_cross3(a1+3,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; + if (eflag>0 || vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + red_acc[6][tid]=energy; - tempv[0] = temp1*temp[6]; - tempv[1] = temp1*temp[7]; - tempv[2] = temp1*temp[8]; - gpu_cross3(a1+6,tempv,tempv2); - teta[0] += tempv2[0]; - teta[1] += tempv2[1]; - teta[2] += tempv2[2]; - } - - numtyp chi, dchi[3], tchi[3]; - { // Compute chi and dchi - - // Compute b12 - numtyp b2[9], b12[9]; - { - gpu_times3(well[jtype],a2,b12); - gpu_transpose_times3(a2,b12,b2); - gpu_plus3(b1,b2,b12); + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<7; r++) + red_acc[r][tid] += red_acc[r][tid+s]; 
+ } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + energy=red_acc[6][tid]; } - - // compute chi_12 - r12[0]*=r; - r12[1]*=r; - r12[2]*=r; - numtyp iota[3]; - gpu_mldivide3(b12,r12,iota,err_flag); - // -- iota is now iota/r - iota[0]*=ir; - iota[1]*=ir; - iota[2]*=ir; - r12[0]*=ir; - r12[1]*=ir; - r12[2]*=ir; - chi = gpu_dot3(r12,iota); - chi = pow(chi*(numtyp)2.0,gum[2]); - - // -- iota is now ok - iota[0]*=r; - iota[1]*=r; - iota[2]*=r; - - numtyp temp1 = gpu_dot3(iota,r12); - numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*pow(chi,(gum[2]-(numtyp)1.0)/ - gum[2]); - dchi[0] = temp2*(iota[0]-temp1*r12[0]); - dchi[1] = temp2*(iota[1]-temp1*r12[1]); - dchi[2] = temp2*(iota[2]-temp1*r12[2]); - - // compute t_chi - numtyp tempv[3]; - gpu_row_times3(iota,b1,tempv); - gpu_cross3(tempv,iota,tchi); - temp1 = (numtyp)-4.0*ir*ir; - tchi[0] *= temp1; - tchi[1] *= temp1; - tchi[2] *= temp1; } - numtyp temp2 = factor_lj*eta*chi; - if (eflag>0) - energy+=u_r*temp2; - numtyp temp1 = -eta*u_r*factor_lj; - if (vflag>0) { - r12[0]*=-r; - r12[1]*=-r; - r12[2]*=-r; - numtyp ft=temp1*dchi[0]-temp2*dUr[0]; - f.x+=ft; - virial[0]+=r12[0]*ft; - ft=temp1*dchi[1]-temp2*dUr[1]; - f.y+=ft; - virial[1]+=r12[1]*ft; - virial[3]+=r12[0]*ft; - ft=temp1*dchi[2]-temp2*dUr[2]; - f.z+=ft; - virial[2]+=r12[2]*ft; - virial[4]+=r12[0]*ft; - virial[5]+=r12[1]*ft; - } else { - f.x+=temp1*dchi[0]-temp2*dUr[0]; - f.y+=temp1*dchi[1]-temp2*dUr[1]; - f.z+=temp1*dchi[2]-temp2*dUr[2]; - } - - // Torque on 1 - temp1 = -u_r*eta*factor_lj; - temp2 = -u_r*chi*factor_lj; - numtyp temp3 = -chi*eta*factor_lj; - tor.x+=temp1*tchi[0]+temp2*teta[0]+temp3*tUr[0]; - tor.y+=temp1*tchi[1]+temp2*teta[1]+temp3*tUr[1]; - tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; - - } // for nbor - // Store answers - __global acctyp *ap1=engv+ii; - if (eflag>0) { - *ap1=energy; - ap1+=astride; - } - if (vflag>0) { - for (int i=0; i<6; i++) { - *ap1=virial[i]; + if (ii0) { + *ap1=energy; ap1+=astride; } - } - ans[ii]=f; - 
ans[ii+astride]=tor; + if (vflag>0) { + for (int i=0; i<6; i++) { + *ap1=virial[i]; + ap1+=astride; + } + } + ans[ii]=f; + ans[ii+astride]=tor; } // if ii } diff --git a/lib/gpu/gb_gpu_kernel_lj.cu b/lib/gpu/gb_gpu_kernel_lj.cu index 3e42cbcbbc..657fc20cd5 100644 --- a/lib/gpu/gb_gpu_kernel_lj.cu +++ b/lib/gpu/gb_gpu_kernel_lj.cu @@ -34,33 +34,36 @@ __kernel void kernel_sphere_gb(__global numtyp4 *x_,__global numtyp4 *q, __global acctyp4 *ans, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag,const int start, const int inum, - const int nall) { - __local numtyp sp_lj[4]; + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; - __syncthreads(); + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -265,39 
+307,42 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, __global acctyp4 *ans, __global acctyp *engv, __global int *err_flag, const int eflag, const int vflag, const int start, const int inum, - const int nall) { - __local numtyp sp_lj[4]; - - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; - __syncthreads(); + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; + sp_lj[0]=gum[3]; + sp_lj[1]=gum[4]; + sp_lj[2]=gum[5]; + sp_lj[3]=gum[6]; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1+=energy; @@ -361,50 +445,54 @@ __kernel void kernel_lj(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_lj_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, __global numtyp *gum, - const int stride, - __global int *dev_ij, __global acctyp4 *ans, - __global acctyp *engv, __global int *err_flag, - const int eflag,const 
int vflag, const int start, - const int inum, const int nall) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int stride, __global int *dev_ij, + __global acctyp4 *ans, __global acctyp *engv, + __global int *err_flag, const int eflag, + const int vflag, const int start, const int inum, + const int nall, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom+start; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (ii<4) - sp_lj[ii]=gum[ii+3]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X)+start; + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1+=energy; diff --git a/lib/gpu/gb_gpu_kernel_nbor.cu b/lib/gpu/gb_gpu_kernel_nbor.cu index 80da8b8d9d..1b1d81fa42 100644 --- a/lib/gpu/gb_gpu_kernel_nbor.cu +++ b/lib/gpu/gb_gpu_kernel_nbor.cu @@ -18,8 +18,6 @@ #ifndef PAIR_GPU_KERNEL_H #define PAIR_GPU_KERNEL_H -#define MAX_SHARED_TYPES 8 
- #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -32,7 +30,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" #else @@ -42,6 +40,7 @@ #define BLOCK_ID_X get_group_id(0) #define BLOCK_SIZE_X get_local_size(0) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define MAX_SHARED_TYPES 8 #endif diff --git a/lib/gpu/gb_gpu_memory.cpp b/lib/gpu/gb_gpu_memory.cpp index 1d78204031..971649c6e8 100644 --- a/lib/gpu/gb_gpu_memory.cpp +++ b/lib/gpu/gb_gpu_memory.cpp @@ -32,30 +32,35 @@ template GB_GPU_MemoryT::GB_GPU_Memory() : _allocated(false), _compiled(false), _max_bytes(0.0) { device=&pair_gpu_device; + ans=new PairGPUAns(); + nbor=new PairGPUNbor; } template GB_GPU_MemoryT::~GB_GPU_Memory() { clear(); + delete ans; + delete nbor; } template int GB_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { - return device->atom.bytes_per_atom()+device->nbor.bytes_per_atom(max_nbors); + return device->atom.bytes_per_atom()+ans->bytes_per_atom()+ + nbor->bytes_per_atom(max_nbors); } template -bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, - const double upsilon, const double mu, - double **host_shape, double **host_well, - double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, - int **h_form, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, - double **host_offset, const double *host_special_lj, - const int nlocal, const int nall, - const int max_nbors, const double cell_size, - const double gpu_split, FILE *_screen) { +int GB_GPU_MemoryT::init(const int ntypes, const double gamma, + const double upsilon, const double mu, + double **host_shape, double **host_well, + double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, + int **h_form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, + double **host_offset, const double *host_special_lj, + const int nlocal, const int nall, + 
const int max_nbors, const double cell_size, + const double gpu_split, FILE *_screen) { nbor_time_avail=false; screen=_screen; @@ -64,24 +69,24 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, gpu_nbor=true; int _gpu_host=0; - int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_nbor,gpu_split); + int host_nlocal=hd_balancer.first_host_count(nlocal,gpu_split,gpu_nbor); if (host_nlocal>0) _gpu_host=1; - if (!device->init(false,true,nlocal,host_nlocal,nall,0,gpu_nbor,_gpu_host, - max_nbors,cell_size,true)) - return false; + _threads_per_atom=device->threads_per_atom(); + int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,0, + _gpu_host,max_nbors,cell_size,true); + if (success!=0) + return success; + ucl_device=device->gpu; atom=&device->atom; - nbor=&device->nbor; - _block_size=BLOCK_1D; - if (static_cast(_block_size)>ucl_device->group_size()) - _block_size=ucl_device->group_size(); + _block_size=device->pair_block_size(); compile_kernels(*ucl_device); // Initialize host-device load balancer - hd_balancer.init(device,gpu_split); + hd_balancer.init(device,gpu_nbor,gpu_split); // Initialize timers for the selected GPU time_pair.init(*ucl_device); @@ -90,8 +95,9 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && _block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=device->max_shared_types(); + if (lj_types<=max_shared_types && _block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -186,12 +192,19 @@ bool GB_GPU_MemoryT::init(const int ntypes, const double gamma, } if (multiple_forms) - atom->dev_ans.zero(); + ans->dev_ans.zero(); - _max_bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + _max_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); // Memory for ilist ordered by particle type - return 
(host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS); + if (host_olist.alloc(nbor->max_atoms(),*ucl_device)==UCL_SUCCESS) + return 0; + else return -3; +} + +template +void GB_GPU_MemoryT::estimate_gpu_overhead() { + device->estimate_gpu_overhead(2,_gpu_overhead,_driver_overhead); } template @@ -209,9 +222,9 @@ void GB_GPU_MemoryT::clear() { // Output any timing information acc_timers(); - double single[6], times[6]; + double single[9], times[9]; - single[0]=atom->transfer_time(); + single[0]=atom->transfer_time()+ans->transfer_time(); single[1]=nbor->time_nbor.total_seconds(); single[2]=time_kernel.total_seconds()+time_kernel2.total_seconds()+ nbor->time_kernel.total_seconds(); @@ -220,15 +233,18 @@ void GB_GPU_MemoryT::clear() { single[4]=time_pair.total_seconds(); else single[4]=0; - single[5]=atom->cast_time(); + single[5]=atom->cast_time()+ans->cast_time(); + single[6]=_gpu_overhead; + single[7]=_driver_overhead; + single[8]=ans->cpu_idle_time(); - MPI_Reduce(single,times,6,MPI_DOUBLE,MPI_SUM,0,device->replica()); + MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,device->replica()); double avg_split=hd_balancer.all_avg_split(); _max_bytes+=dev_error.row_bytes()+lj1.row_bytes()+lj3.row_bytes()+ sigma_epsilon.row_bytes()+cut_form.row_bytes()+ shape.row_bytes()+well.row_bytes()+lshape.row_bytes()+ - gamma_upsilon_mu.row_bytes(); + gamma_upsilon_mu.row_bytes()+atom->max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, device->replica()); @@ -255,10 +271,19 @@ void GB_GPU_MemoryT::clear() { fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } + fprintf(screen,"GPU Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); + fprintf(screen,"CPU Idle_Time: %.4f 
s.\n",times[8]/replica_size); fprintf(screen,"-------------------------------------"); fprintf(screen,"--------------------------------\n\n"); + + + fprintf(screen,"Average split: %.4f.\n",avg_split); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + + } _max_bytes=0.0; @@ -299,10 +324,9 @@ void GB_GPU_MemoryT::clear() { template double GB_GPU_MemoryT::host_memory_usage() const { - return device->atom.host_memory_usage()+ - device->nbor.host_memory_usage()+4*sizeof(numtyp)+ - sizeof(GB_GPU_Memory)+ - device->nbor.max_atoms()*sizeof(int); + return device->atom.host_memory_usage()+nbor->host_memory_usage()+ + 4*sizeof(numtyp)+sizeof(GB_GPU_Memory)+ + nbor->max_atoms()*sizeof(int); } template diff --git a/lib/gpu/gb_gpu_memory.h b/lib/gpu/gb_gpu_memory.h index 2cfc805cd8..40ed8bec51 100644 --- a/lib/gpu/gb_gpu_memory.h +++ b/lib/gpu/gb_gpu_memory.h @@ -18,8 +18,6 @@ #ifndef GB_GPU_MEMORY_H #define GB_GPU_MEMORY_H -#define BLOCK_1D 64 - #include "pair_gpu_device.h" #include "pair_gpu_balance.h" #include "mpi.h" @@ -35,23 +33,34 @@ class GB_GPU_Memory { * \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin * \param gpu_split fraction of particles handled by device - * \return false if there is not sufficient memory or device init prob **/ - bool init(const int ntypes, const double gamma, - const double upsilon, const double mu, double **host_shape, - double **host_well, double **host_cutsq, double **host_sigma, - double **host_epsilon, double *host_lshape, int **h_form, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - const double *host_special_lj, const int nlocal, const int nall, - const int max_nbors, const double cell_size, - const double gpu_split, FILE *screen); + * \return false if there is not sufficient memory or device init prob + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the 
GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, const double gamma, + const double upsilon, const double mu, double **host_shape, + double **host_well, double **host_cutsq, double **host_sigma, + double **host_epsilon, double *host_lshape, int **h_form, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + const double *host_special_lj, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *screen); + + /// Estimate the overhead for GPU context changes and CPU driver + void estimate_gpu_overhead(); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ inline void resize_atom(const int inum, const int nall, bool &success) { - atom->resize(inum, nall, success); - if (multiple_forms) atom->dev_ans.zero(); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + atom->resize(nall, success); + ans->resize(inum, success); + if (multiple_forms) ans->dev_ans.zero(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_bytes) _max_bytes=bytes; } @@ -74,7 +83,7 @@ class GB_GPU_Memory { success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS); } nbor->resize(nlocal,host_inum,max_nbors,success); - double bytes=atom->gpu_bytes()+nbor->gpu_bytes(); + double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_bytes) _max_bytes=bytes; } @@ -91,19 +100,22 @@ class GB_GPU_Memory { /// Accumulate timers inline void acc_timers() { - if (nbor_time_avail) { - nbor->time_nbor.add_to_total(); - nbor->time_kernel.add_to_total(); - nbor_time_avail=false; + if (device->time_device()) { + if (nbor_time_avail) { + nbor->time_nbor.add_to_total(); + nbor->time_kernel.add_to_total(); + nbor_time_avail=false; + } + time_kernel.add_to_total(); + time_gayberne.add_to_total(); + if (multiple_forms) { + 
time_kernel2.add_to_total(); + time_gayberne2.add_to_total(); + time_pair.add_to_total(); + } + atom->acc_timers(); + ans->acc_timers(); } - time_kernel.add_to_total(); - time_gayberne.add_to_total(); - if (multiple_forms) { - time_kernel2.add_to_total(); - time_gayberne2.add_to_total(); - time_pair.add_to_total(); - } - atom->acc_timers(); } /// Accumulate timers @@ -117,6 +129,7 @@ class GB_GPU_Memory { time_pair.zero(); } atom->zero_timers(); + ans->zero_timers(); } // -------------------------- DEVICE DATA ------------------------- @@ -168,6 +181,10 @@ class GB_GPU_Memory { int last_ellipse, max_last_ellipse; + // ------------------------ FORCE/ENERGY DATA ----------------------- + + PairGPUAns *ans; + // --------------------------- NBOR DATA ---------------------------- /// Neighbor data @@ -183,10 +200,12 @@ class GB_GPU_Memory { UCL_Kernel k_gayberne, k_sphere_gb, k_lj_fast, k_lj; inline int block_size() { return _block_size; } + int _threads_per_atom; private: bool _allocated, _compiled; int _block_size; double _max_bytes; + double _gpu_overhead, _driver_overhead; void compile_kernels(UCL_Device &dev); }; diff --git a/lib/gpu/geryon/VERSION.txt b/lib/gpu/geryon/VERSION.txt index 77e0a073c7..d260cab24e 100644 --- a/lib/gpu/geryon/VERSION.txt +++ b/lib/gpu/geryon/VERSION.txt @@ -1,2 +1,2 @@ -Geryon Version 10.280 - \ No newline at end of file +Geryon Version 11.094 + diff --git a/lib/gpu/geryon/nvc_device.h b/lib/gpu/geryon/nvc_device.h index ed445716f6..6a232986ff 100644 --- a/lib/gpu/geryon/nvc_device.h +++ b/lib/gpu/geryon/nvc_device.h @@ -167,6 +167,7 @@ class UCL_Device { int _device, _num_devices; std::vector _properties; std::vector _cq; + std::vector _device_ids; }; // Grabs the properties for all devices @@ -178,6 +179,7 @@ inline UCL_Device::UCL_Device() { if (deviceProp.major == 9999 && deviceProp.minor == 9999) break; _properties.push_back(deviceProp); + _device_ids.push_back(dev); } _device=-1; _cq.push_back(cudaStream_t()); @@ -194,7 +196,7 @@ 
inline void UCL_Device::set(int num) { return; for (int i=1; i + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24) 
{ + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); 
add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); + } + + template + inline void add_args(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + 
t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); + } + // --------------------------------------------------------------------------- @@ -439,6 +624,211 @@ run(); } + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + 
t21 *a21, t22 *a22, t23 *a23) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26) { + clear_args(); + add_arg(a1); add_arg(a2); 
add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 
*a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); + run(); + } + + template + inline void run(t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); + run(); + } + // --------------------------------------------------------------------------- template @@ -671,3 +1061,208 @@ run(cq); } + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + 
add_arg(a21); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 
*a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + 
add_arg(a26); add_arg(a27); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); + run(cq); + } + + template + inline void run_cq(command_queue &cq, t1 *a1, t2 *a2, t3 *a3, t4 *a4, t5 *a5, + t6 *a6, t7 *a7, t8 *a8, t9 *a9, t10 *a10, + t11 *a11, t12 *a12, t13 *a13, t14 *a14, t15 *a15, + t16 *a16, t17 *a17, t18 *a18, t19 *a19, t20 *a20, + t21 *a21, t22 *a22, t23 *a23, t24 *a24, t25 *a25, + t26 *a26, t27 *a27, t28 *a28, t29 *a29, t30 *a30) { + clear_args(); + add_arg(a1); add_arg(a2); add_arg(a3); add_arg(a4); 
add_arg(a5); + add_arg(a6); add_arg(a7); add_arg(a8); add_arg(a9); add_arg(a10); + add_arg(a11); add_arg(a12); add_arg(a13); add_arg(a14); add_arg(a15); + add_arg(a16); add_arg(a17); add_arg(a18); add_arg(a19); add_arg(a20); + add_arg(a21); add_arg(a22); add_arg(a23); add_arg(a24); add_arg(a25); + add_arg(a26); add_arg(a27); add_arg(a28); add_arg(a29); add_arg(a30); + run(cq); + } + diff --git a/lib/gpu/geryon/ucl_d_mat.h b/lib/gpu/geryon/ucl_d_mat.h index c0531b2f29..11ec58629a 100644 --- a/lib/gpu/geryon/ucl_d_mat.h +++ b/lib/gpu/geryon/ucl_d_mat.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -61,20 +61,23 @@ class UCL_D_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, mat_type &cq, const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _rows=rows; - _cols=cols; + int err=_device_alloc(*this,cq,rows,cols,_pitch,kind); - _row_size=_pitch/sizeof(numtyp); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+_row_size*cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << rows*cols*sizeof(numtyp) << " bytes on device.\n"; exit(1); + #endif + return err; } + + _kind=kind; + _rows=rows; + _cols=cols; + _row_size=_pitch/sizeof(numtyp); + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_row_size*cols; #endif #ifdef _OCL_MAT _offset=0; @@ -94,20 +97,23 @@ class UCL_D_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _rows=rows; - _cols=cols; + int err=_device_alloc(*this,device,rows,cols,_pitch,kind); - 
_row_size=_pitch/sizeof(numtyp); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+_row_size*cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << rows*cols*sizeof(numtyp) << " bytes on device.\n"; exit(1); + #endif + return err; } + + _kind=kind; + _rows=rows; + _cols=cols; + _row_size=_pitch/sizeof(numtyp); + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+_row_size*cols; #endif #ifdef _OCL_MAT _offset=0; diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index 45c94bee82..0be063c940 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -60,19 +60,24 @@ class UCL_D_Vec : public UCL_BaseMat { const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _cols=cols; + _row_bytes=cols*sizeof(numtyp); int err=_device_alloc(*this,cq,_row_bytes,kind); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on device.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } + + _kind=kind; + _cols=cols; + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+cols; #endif #ifdef _OCL_MAT _offset=0; @@ -90,19 +95,23 @@ class UCL_D_Vec : public UCL_BaseMat { inline int alloc(const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_READ_WRITE) { clear(); - _kind=kind; - _cols=cols; _row_bytes=cols*sizeof(numtyp); int err=_device_alloc(*this,device,_row_bytes,kind); - #ifndef _UCL_DEVICE_PTR_MAT - _end=_array+cols; - #endif - #ifndef UCL_NO_EXIT if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on device.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } + + _kind=kind; + _cols=cols; + #ifndef _UCL_DEVICE_PTR_MAT + _end=_array+cols; #endif #ifdef _OCL_MAT _offset=0; diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index 51593cfa23..762bb03131 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains @@ -39,7 +39,11 @@ class UCL_H_Mat : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { } + UCL_H_Mat() : _kind(UCL_VIEW), _rows(0) { + #ifdef _OCL_MAT + _carray=(cl_mem)(0); + #endif + } ~UCL_H_Mat() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } /// Construct with specied number of rows and columns @@ -59,18 +63,23 @@ class UCL_H_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, mat_type &cq, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; - _rows=rows; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; - int err=_host_alloc(*this,cq,_row_bytes*_rows,kind); - #ifndef UCL_NO_EXIT + int err=_host_alloc(*this,cq,_row_bytes*rows,kind); if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _rows=rows; + _kind=kind; _end=_array+rows*cols; return err; } @@ -85,19 +94,24 @@ class UCL_H_Mat : public UCL_BaseMat { inline int alloc(const size_t rows, const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; - _rows=rows; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; - int err=_host_alloc(*this,device,_row_bytes*_rows,kind); - _end=_array+rows*cols; - #ifndef UCL_NO_EXIT + int err=_host_alloc(*this,device,_row_bytes*rows,kind); if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes*_rows << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _rows=rows; + _kind=kind; + _end=_array+rows*cols; return err; } diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index ca1dd12a47..4af1e2179f 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -13,7 +13,7 @@ copyright : (C) 2009 by W. 
Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2009) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -39,7 +39,11 @@ class UCL_H_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { } + UCL_H_Vec() : _kind(UCL_VIEW), _cols(0) { + #ifdef _OCL_MAT + _carray=(cl_mem)(0); + #endif + } ~UCL_H_Vec() { if (_kind!=UCL_VIEW) _host_free(*this,_kind); } /// Construct with n columns @@ -59,18 +63,24 @@ class UCL_H_Vec : public UCL_BaseMat { inline int alloc(const size_t cols, mat_type &cq, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; int err=_host_alloc(*this,cq,_row_bytes,kind); - _end=_array+cols; - #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _kind=kind; + _end=_array+cols; return err; } @@ -84,18 +94,24 @@ class UCL_H_Vec : public UCL_BaseMat { inline int alloc(const size_t cols, UCL_Device &device, const enum UCL_MEMOPT kind=UCL_RW_OPTIMIZED) { clear(); - _cols=cols; + _row_bytes=cols*sizeof(numtyp); - _kind=kind; int err=_host_alloc(*this,device,_row_bytes,kind); - _end=_array+cols; - #ifndef UCL_NO_EXIT + if (err!=UCL_SUCCESS) { + #ifndef UCL_NO_EXIT std::cerr << "UCL Error: Could not allocate " << _row_bytes << " bytes on host.\n"; + _row_bytes=0; exit(1); + #endif + _row_bytes=0; + return err; } - #endif + + _cols=cols; + _kind=kind; + _end=_array+cols; return err; } diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h index 1ea9175e3a..5c45dc3a87 100644 --- a/lib/gpu/geryon/ucl_nv_kernel.h +++ 
b/lib/gpu/geryon/ucl_nv_kernel.h @@ -13,7 +13,7 @@ copyright : (C) 2010 by W. Michael Brown email : brownw@ornl.gov ***************************************************************************/ - + /* ----------------------------------------------------------------------- Copyright (2010) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains @@ -25,8 +25,18 @@ #ifndef UCL_NV_KERNEL_H #define UCL_NV_KERNEL_H -#define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x) -#define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y) +#if (__CUDA_ARCH__ < 200) +#define mul24 __mul24 +#define MEM_THREADS 16 +#else +#define mul24(X,Y) (X)*(Y) +#define MEM_THREADS 32 +#endif + +#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) +#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) +#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); +#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); #define THREAD_ID_X threadIdx.x #define THREAD_ID_Y threadIdx.y #define BLOCK_ID_X blockIdx.x @@ -35,8 +45,9 @@ #define BLOCK_SIZE_Y blockDim.y #define __kernel extern "C" __global__ #define __local __shared__ -#define mul24 __mul24 #define __global #define __inline static __inline__ __device__ +#define atom_add atomicAdd #endif + diff --git a/lib/gpu/lj96_cut_gpu.cpp b/lib/gpu/lj96_cut_gpu.cpp index 24fb5d8570..df83afd521 100644 --- a/lib/gpu/lj96_cut_gpu.cpp +++ b/lib/gpu/lj96_cut_gpu.cpp @@ -28,11 +28,11 @@ static LJ96_GPU_Memory LJ96MF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - 
const double cell_size, int &gpu_mode, FILE *screen) { +int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { LJ96MF.clear(); gpu_mode=LJ96MF.device->gpu_mode(); double gpu_split=LJ96MF.device->particle_split(); @@ -53,13 +53,11 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, fflush(screen); } - if (world_me==0) { - bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); LJ96MF.device->world_barrier(); if (message) @@ -74,46 +72,46 @@ bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, - nall, 300, maxspecial, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen); + LJ96MF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJ96MF.estimate_gpu_overhead(); + return init_ok; } void lj96_gpu_clear() { LJ96MF.clear(); } -int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** lj96_gpu_compute_n(const int ago, const int inum_full, const int nall, 
double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success) { - return LJ96MF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success); + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return LJ96MF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); } -void lj96_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success) { - LJ96MF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, - firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); +void lj96_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + LJ96MF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, + eflag,vflag,eatom,vatom,host_start,cpu_time,success); } double lj96_gpu_bytes() { diff --git a/lib/gpu/lj96_cut_gpu_kernel.cu b/lib/gpu/lj96_cut_gpu_kernel.cu index 0d3a01fbac..3fc6a2f308 100644 --- a/lib/gpu/lj96_cut_gpu_kernel.cu +++ b/lib/gpu/lj96_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef LJ96_GPU_KERNEL #define LJ96_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef 
NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; #ifdef _DOUBLE_DOUBLE @@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for 
(int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -176,49 +230,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int nall, const int nbor_pitch, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + 
virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/lj96_cut_gpu_memory.cpp b/lib/gpu/lj96_cut_gpu_memory.cpp index d365d71044..0b066c0973 100644 --- a/lib/gpu/lj96_cut_gpu_memory.cpp +++ b/lib/gpu/lj96_cut_gpu_memory.cpp @@ -42,7 +42,7 @@ int LJ96_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJ96_GPU_MemoryT::init(const int ntypes, +int LJ96_GPU_MemoryT::init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, @@ -50,14 +50,18 @@ bool LJ96_GPU_MemoryT::init(const int ntypes, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj96_cut_gpu_kernel); + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj96_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -84,7 +88,7 @@ bool LJ96_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -122,9 +126,10 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int 
nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,16 +138,18 @@ void LJ96_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lj96_cut_gpu_memory.h b/lib/gpu/lj96_cut_gpu_memory.h index 483ef05570..fe0a0b1665 100644 --- a/lib/gpu/lj96_cut_gpu_memory.h +++ b/lib/gpu/lj96_cut_gpu_memory.h @@ -29,13 +29,20 @@ class LJ96_GPU_Memory : public AtomicGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of 
memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/lj_cut_gpu.cpp b/lib/gpu/lj_cut_gpu.cpp index 12fab2f9f1..aef085f7c9 100644 --- a/lib/gpu/lj_cut_gpu.cpp +++ b/lib/gpu/lj_cut_gpu.cpp @@ -28,12 +28,11 @@ static LJL_GPU_Memory LJLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool ljl_gpu_init(const int ntypes, double **cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen) { +int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen) { LJLMF.clear(); gpu_mode=LJLMF.device->gpu_mode(); double gpu_split=LJLMF.device->particle_split(); @@ -54,13 +53,11 @@ bool ljl_gpu_init(const int ntypes, double **cutsq, fflush(screen); } - if (world_me==0) { - bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen); - if (!init_ok) - return false; - } + 
int init_ok=0; + if (world_me==0) + init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen); LJLMF.device->world_barrier(); if (message) @@ -75,45 +72,45 @@ bool ljl_gpu_init(const int ntypes, double **cutsq, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen); + LJLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJLMF.estimate_gpu_overhead(); + return init_ok; } void ljl_gpu_clear() { LJLMF.clear(); } -int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int ** ljl_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success) { - return LJLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success); + int **ilist, int **jnum, const double cpu_time, + bool &success) { + return LJLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success); } -void ljl_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double 
**host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success) { - LJLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void ljl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success) { + LJLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success); } diff --git a/lib/gpu/lj_cut_gpu_kernel.cu b/lib/gpu/lj_cut_gpu_kernel.cu index 0e72e41f36..75f36446f7 100644 --- a/lib/gpu/lj_cut_gpu_kernel.cu +++ b/lib/gpu/lj_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef LJ_GPU_KERNEL #define LJ_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; #ifdef _DOUBLE_DOUBLE @@ -72,6 +70,8 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define __inline inline #define fetch_pos(i,y) x_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -82,40 +82,55 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, const int 
t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[4]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -175,49 +229,65 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const int nall, const int nbor_pitch, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 
lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; - if (ii<4) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/lj_cut_gpu_memory.cpp b/lib/gpu/lj_cut_gpu_memory.cpp index 23b2fcf6d0..a294eb647f 100644 --- a/lib/gpu/lj_cut_gpu_memory.cpp +++ b/lib/gpu/lj_cut_gpu_memory.cpp @@ -42,22 +42,26 @@ int LJL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJL_GPU_MemoryT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double *host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj_cut_gpu_kernel); +int LJL_GPU_MemoryT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double 
**host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,lj_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -84,7 +88,7 @@ bool LJL_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -122,9 +126,10 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -133,16 +138,18 @@ void LJL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, - &ainum, &anall, &nbor_pitch); + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, + &ainum, &anall, &nbor_pitch, + &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), 
&_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, - &anall, &nbor_pitch); + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, + &anall, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/lj_cut_gpu_memory.h b/lib/gpu/lj_cut_gpu_memory.h index 123b739649..4b86b133a1 100644 --- a/lib/gpu/lj_cut_gpu_memory.h +++ b/lib/gpu/lj_cut_gpu_memory.h @@ -29,13 +29,20 @@ class LJL_GPU_Memory : public AtomicGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/ljc_cut_gpu.cpp b/lib/gpu/ljc_cut_gpu.cpp index 955a25adce..de6f4f3e62 100644 --- a/lib/gpu/ljc_cut_gpu.cpp +++ 
b/lib/gpu/ljc_cut_gpu.cpp @@ -28,13 +28,13 @@ static LJC_GPU_Memory LJCMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e) { +int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { LJCMF.clear(); gpu_mode=LJCMF.device->gpu_mode(); double gpu_split=LJCMF.device->particle_split(); @@ -55,15 +55,12 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, fflush(screen); } - if (world_me==0) { - bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, - host_cut_ljsq, host_cut_coulsq, host_special_coul, - qqrd2e); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, + host_lj4, offset, special_lj, inum, nall, 300, + maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); LJCMF.device->world_barrier(); if (message) @@ -78,48 +75,51 @@ bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, 
last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e); + LJCMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJCMF.estimate_gpu_overhead(); + return init_ok; } void ljc_gpu_clear() { LJCMF.clear(); } -int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** ljc_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q) { - return LJCMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success, host_q); + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); } -void ljc_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool 
vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q) { - LJCMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, - firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); +void ljc_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag, + vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); } double ljc_gpu_bytes() { diff --git a/lib/gpu/ljc_cut_gpu_kernel.cu b/lib/gpu/ljc_cut_gpu_kernel.cu index 2751e20702..44a607588a 100644 --- a/lib/gpu/ljc_cut_gpu_kernel.cu +++ b/lib/gpu/ljc_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef LJC_GPU_KERNEL #define LJC_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -46,7 +44,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -82,6 +80,8 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -92,13 +92,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , __global numtyp *cutsq, - const numtyp qqrd2e) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + 
__global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_ , + __global numtyp *cutsq, const numtyp qqrd2e, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -109,29 +113,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -209,54 +266,69 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , __global numtyp *_cutsq, - const numtyp qqrd2e) { - // ii indexes the two interacting particles in gi - int 
ii=THREAD_ID_X; + const numtyp qqrd2e, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/ljc_cut_gpu_memory.cpp b/lib/gpu/ljc_cut_gpu_memory.cpp index d63ed6e5d9..642ff6ecc7 100644 --- a/lib/gpu/ljc_cut_gpu_memory.cpp +++ b/lib/gpu/ljc_cut_gpu_memory.cpp @@ -43,24 +43,28 @@ int LJC_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJC_GPU_MemoryT::init(const int ntypes, - double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, - double 
*host_special_lj, const int nlocal, - const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *_screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,ljc_cut_gpu_kernel); +int LJC_GPU_MemoryT::init(const int ntypes, + double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, + double *host_special_lj, const int nlocal, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *_screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e) { + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ljc_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -95,7 +99,7 @@ bool LJC_GPU_MemoryT::init(const int ntypes, _allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+ sp_lj.row_bytes(); - return true; + return 0; } template @@ -134,9 +138,10 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); 
this->time_pair.start(); @@ -145,19 +150,20 @@ void LJC_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &cutsq.begin(), - &_qqrd2e); + &_qqrd2e, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &cutsq.begin(), &_qqrd2e); + &cutsq.begin(), &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/ljc_cut_gpu_memory.h b/lib/gpu/ljc_cut_gpu_memory.h index 4dedce957a..552f9d9881 100644 --- a/lib/gpu/ljc_cut_gpu_memory.h +++ b/lib/gpu/ljc_cut_gpu_memory.h @@ -29,15 +29,22 @@ class LJC_GPU_Memory : public ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, double **host_cut_ljsq, - double **host_cut_coulsq, double *host_special_coul, - const double qqrd2e); + * \param 
gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + double **host_cut_coulsq, double *host_special_coul, + const double qqrd2e); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/ljcl_cut_gpu.cpp b/lib/gpu/ljcl_cut_gpu.cpp index 8fa15998bf..167f41b374 100644 --- a/lib/gpu/ljcl_cut_gpu.cpp +++ b/lib/gpu/ljcl_cut_gpu.cpp @@ -28,14 +28,14 @@ static LJCL_GPU_Memory LJCLMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald) { +int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int inum, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, 
double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald) { LJCLMF.clear(); gpu_mode=LJCLMF.device->gpu_mode(); double gpu_split=LJCLMF.device->particle_split(); @@ -56,15 +56,12 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, fflush(screen); } - if (world_me==0) { - bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, screen, - host_cut_ljsq, host_cut_coulsq, host_special_coul, - qqrd2e,g_ewald); - if (!init_ok) - return false; - } + int init_ok=0; + if (world_me==0) + init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); LJCLMF.device->world_barrier(); if (message) @@ -79,48 +76,51 @@ bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, last_gpu,i); fflush(screen); } - if (gpu_rank==i && world_me!=0) { - bool init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, - maxspecial, cell_size, gpu_split, - screen, host_cut_ljsq, host_cut_coulsq, - host_special_coul, qqrd2e, g_ewald); - if (!init_ok) - return false; - } + if (gpu_rank==i && world_me!=0) + init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, + offset, special_lj, inum, nall, 300, maxspecial, + cell_size, gpu_split, screen, host_cut_ljsq, + host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); + LJCLMF.device->gpu_barrier(); if (message) fprintf(screen,"Done.\n"); } if (message) fprintf(screen,"\n"); - return true; + + if (init_ok==0) + LJCLMF.estimate_gpu_overhead(); + return init_ok; } void ljcl_gpu_clear() { LJCLMF.clear(); } -int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum_full, +int** ljcl_gpu_compute_n(const int ago, const int inum_full, 
const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q) { - return LJCLMF.compute(timestep, ago, inum_full, nall, host_x, host_type, boxlo, - boxhi, tag, nspecial, special, eflag, vflag, eatom, - vatom, host_start, cpu_time, success, host_q); + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd) { + return LJCLMF.compute(ago, inum_full, nall, host_x, host_type, sublo, + subhi, tag, nspecial, special, eflag, vflag, eatom, + vatom, host_start, ilist, jnum, cpu_time, success, + host_q, boxlo, prd); } -void ljcl_gpu_compute(const int timestep, const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q) { - LJCLMF.compute(timestep,ago,inum_full,nall,host_x,host_type,ilist,numj, +void ljcl_gpu_compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { + LJCLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj, firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success, - host_q); + host_q,nlocal,boxlo,prd); } double ljcl_gpu_bytes() { diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu index a0b27f0259..7be7a86114 100644 --- a/lib/gpu/ljcl_cut_gpu_kernel.cu +++ b/lib/gpu/ljcl_cut_gpu_kernel.cu @@ -18,8 +18,6 @@ #ifndef 
LJCL_GPU_KERNEL #define LJCL_GPU_KERNEL -#define MAX_SHARED_TYPES 8 - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 @@ -54,7 +52,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include "nv_kernel_def.h" texture pos_tex; texture q_tex; @@ -90,6 +88,8 @@ __inline float fetch_q(const int& i, const float *q) #define fetch_pos(i,y) x_[i] #define fetch_q(i,y) q_[i] +#define BLOCK_PAIR 64 +#define MAX_SHARED_TYPES 8 #endif @@ -100,13 +100,17 @@ __inline int sbmask(int j) { return j >> SBBITS & 3; } __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __global numtyp4* lj3, const int lj_types, __global numtyp *sp_lj_in, __global int *dev_nbor, - __global acctyp4 *ans, __global acctyp *engv, - const int eflag, const int vflag, const int inum, - const int nall, const int nbor_pitch, - __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=GLOBAL_ID_X; + __global int *dev_packed, __global acctyp4 *ans, + __global acctyp *engv, const int eflag, + const int vflag, const int inum, const int nall, + const int nbor_pitch, __global numtyp *q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp g_ewald, const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp sp_lj[8]; sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; @@ -117,29 +121,41 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; - // Store answers + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + 
f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; @@ -225,52 +282,68 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1, __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in, __global numtyp4* lj3_in, - __global numtyp* sp_lj_in, __global int *dev_nbor, + __global numtyp* sp_lj_in, + __global int *dev_nbor, __global int *dev_packed, __global acctyp4 *ans, __global acctyp *engv, const int eflag, const int vflag, const int inum, const int nall, const int nbor_pitch, __global numtyp *q_ , const numtyp cut_coulsq, - const numtyp qqrd2e, const numtyp g_ewald) { - // ii indexes the two interacting particles in gi - int ii=THREAD_ID_X; + const numtyp qqrd2e, const numtyp g_ewald, + const int t_per_atom) { + int tid=THREAD_ID_X; + int ii=mul24((int)BLOCK_ID_X,(int)(BLOCK_SIZE_X)/t_per_atom); + ii+=tid/t_per_atom; + int offset=tid%t_per_atom; + __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; - if (ii<8) - sp_lj[ii]=sp_lj_in[ii]; - if (ii0) - lj3[ii]=lj3_in[ii]; + lj3[tid]=lj3_in[tid]; } - ii+=mul24((int)BLOCK_ID_X,(int)BLOCK_SIZE_X); + + acctyp energy=(acctyp)0; + acctyp e_coul=(acctyp)0; + acctyp4 f; + f.x=(acctyp)0; + f.y=(acctyp)0; + f.z=(acctyp)0; + acctyp virial[6]; + for (int i=0; i<6; i++) + virial[i]=(acctyp)0; + __syncthreads(); if (ii1) { + __local acctyp red_acc[6][BLOCK_PAIR]; + + red_acc[0][tid]=f.x; + red_acc[1][tid]=f.y; + red_acc[2][tid]=f.z; + red_acc[3][tid]=energy; + red_acc[4][tid]=e_coul; + + for (unsigned int s=t_per_atom/2; 
s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<5; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + f.x=red_acc[0][tid]; + f.y=red_acc[1][tid]; + f.z=red_acc[2][tid]; + energy=red_acc[3][tid]; + e_coul=red_acc[4][tid]; + + if (vflag>0) { + for (int r=0; r<6; r++) + red_acc[r][tid]=virial[r]; + + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + if (offset < s) { + for (int r=0; r<6; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + + for (int r=0; r<6; r++) + virial[r]=red_acc[r][tid]; + } + } + + // Store answers + if (ii0) { *ap1=energy; diff --git a/lib/gpu/ljcl_cut_gpu_memory.cpp b/lib/gpu/ljcl_cut_gpu_memory.cpp index a126309a92..f37e6b1857 100644 --- a/lib/gpu/ljcl_cut_gpu_memory.cpp +++ b/lib/gpu/ljcl_cut_gpu_memory.cpp @@ -43,7 +43,7 @@ int LJCL_GPU_MemoryT::bytes_per_atom(const int max_nbors) const { } template -bool LJCL_GPU_MemoryT::init(const int ntypes, +int LJCL_GPU_MemoryT::init(const int ntypes, double **host_cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **host_offset, @@ -54,14 +54,18 @@ bool LJCL_GPU_MemoryT::init(const int ntypes, double **host_cut_ljsq, const double host_cut_coulsq, double *host_special_coul, const double qqrd2e, const double g_ewald) { - this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,ljcl_cut_gpu_kernel); + int success; + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, + _screen,ljcl_cut_gpu_kernel); + if (success!=0) + return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - if (lj_types<=MAX_SHARED_TYPES && this->_block_size>=MAX_SHARED_TYPES) { - lj_types=MAX_SHARED_TYPES; + int max_shared_types=this->device->max_shared_types(); + if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { + lj_types=max_shared_types; shared_types=true; } _lj_types=lj_types; @@ -94,7 +98,7 @@ bool LJCL_GPU_MemoryT::init(const int ntypes, 
_allocated=true; this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes(); - return true; + return 0; } template @@ -132,9 +136,10 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { else vflag=0; - int GX=static_cast(ceil(static_cast(this->atom->inum())/BX)); + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); - int ainum=this->atom->inum(); + int ainum=this->ans->inum(); int anall=this->atom->nall(); int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); @@ -143,19 +148,21 @@ void LJCL_GPU_MemoryT::loop(const bool _eflag, const bool _vflag) { this->k_pair_fast.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, + &this->_nbor_data->begin(), + &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), &_cut_coulsq, - &_qqrd2e, &_g_ewald); + &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); this->k_pair.run(&this->atom->dev_x.begin(), &lj1.begin(), &lj3.begin(), &_lj_types, &sp_lj.begin(), &this->nbor->dev_nbor.begin(), - &this->atom->dev_ans.begin(), - &this->atom->dev_engv.begin(), &eflag, &vflag, &ainum, + &this->_nbor_data->begin(), &this->ans->dev_ans.begin(), + &this->ans->dev_engv.begin(), &eflag, &vflag, &ainum, &anall, &nbor_pitch, &this->atom->dev_q.begin(), - &_cut_coulsq, &_qqrd2e, &_g_ewald); + &_cut_coulsq, &_qqrd2e, &_g_ewald, + &this->_threads_per_atom); } this->time_pair.stop(); } diff --git a/lib/gpu/ljcl_cut_gpu_memory.h b/lib/gpu/ljcl_cut_gpu_memory.h index 056ba0e41f..fae4c07040 100644 --- a/lib/gpu/ljcl_cut_gpu_memory.h +++ b/lib/gpu/ljcl_cut_gpu_memory.h @@ -29,15 +29,22 @@ class LJCL_GPU_Memory : public ChargeGPUMemory { /// Clear any previous data and set up for a new LAMMPS run /** \param max_nbors initial number of 
rows in the neighbor matrix * \param cell_size cutoff + skin - * \param gpu_split fraction of particles handled by device **/ - bool init(const int ntypes, double **host_cutsq, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **host_offset, double *host_special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - const double gpu_split, FILE *screen, double **host_cut_ljsq, - const double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); + * \param gpu_split fraction of particles handled by device + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(const int ntypes, double **host_cutsq, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **host_offset, double *host_special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + const double gpu_split, FILE *screen, double **host_cut_ljsq, + const double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ diff --git a/lib/gpu/pair_gpu_atom.cpp b/lib/gpu/pair_gpu_atom.cpp index 0ca2345087..e34a15c0b9 100644 --- a/lib/gpu/pair_gpu_atom.cpp +++ b/lib/gpu/pair_gpu_atom.cpp @@ -29,9 +29,8 @@ __win_sort _win_sort; #endif template -PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false),_eflag(false), - _vflag(false),_inum(0),_ilist(NULL), - _newton(false) { +PairGPUAtomT::PairGPUAtom() : _compiled(false),_allocated(false), + _max_gpu_bytes(0) { #ifndef USE_OPENCL sort_config.op = CUDPP_ADD; sort_config.datatype = CUDPP_UINT; @@ -56,28 +55,20 @@ int 
PairGPUAtomT::bytes_per_atom() const { int id_space=0; if (_gpu_nbor) id_space=2; - int bytes=4*sizeof(numtyp)+11*sizeof(acctyp)+id_space; + int bytes=4*sizeof(numtyp)+id_space; if (_rot) - bytes+=4*sizeof(numtyp)+4*sizeof(acctyp); + bytes+=4*sizeof(numtyp); if (_charge) bytes+=sizeof(numtyp); return bytes; } template -bool PairGPUAtomT::alloc(const int inum, const int nall) { +bool PairGPUAtomT::alloc(const int nall) { _max_atoms=static_cast(static_cast(nall)*1.10); - if (_newton) - _max_local=_max_atoms; - else - _max_local=static_cast(static_cast(inum)*1.10); bool success=true; - int ans_elements=4; - if (_rot) - ans_elements+=4; - // Ignore host/device transfers? bool cpuview=false; if (dev->device_type()==UCL_CPU) @@ -107,8 +98,6 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) { success=success && (host_x.alloc(_max_atoms*4,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); #endif - success=success &&(host_ans.alloc(ans_elements*_max_local,*dev)==UCL_SUCCESS); - success=success &&(host_engv.alloc(_ev_fields*_max_local,*dev)==UCL_SUCCESS); // Buffer for casting only if different precisions if (_charge) success=success && (host_q.alloc(_max_atoms,*dev, @@ -120,15 +109,13 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) { // --------------------------- Device allocations - _gpu_bytes=0; + int gpu_bytes=0; if (cpuview) { #ifdef GPU_CAST assert(0==1); #else dev_x.view(host_x); #endif - dev_engv.view(host_engv); - dev_ans.view(host_ans); if (_rot) dev_quat.view(host_quat); if (_charge) @@ -140,49 +127,80 @@ bool PairGPUAtomT::alloc(const int inum, const int nall) { dev_x_cast.alloc(_max_atoms*3,*dev,UCL_READ_ONLY)); success=success && (UCL_SUCCESS== dev_type_cast.alloc(_max_atoms,*dev,UCL_READ_ONLY)); - _gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); + gpu_bytes+=dev_x_cast.row_bytes()+dev_type_cast.row_bytes(); #else success=success && (UCL_SUCCESS== dev_x.alloc(_max_atoms*4,*dev,UCL_READ_ONLY)); #endif - success=success && 
(dev_engv.alloc(_ev_fields*_max_local,*dev, - UCL_WRITE_ONLY)==UCL_SUCCESS); - success=success && (dev_ans.alloc(ans_elements*_max_local, - *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); if (_charge) { success=success && (dev_q.alloc(_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - _gpu_bytes+=dev_q.row_bytes(); + gpu_bytes+=dev_q.row_bytes(); } if (_rot) { success=success && (dev_quat.alloc(_max_atoms*4,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - _gpu_bytes+=dev_quat.row_bytes(); + gpu_bytes+=dev_quat.row_bytes(); } } if (_gpu_nbor) { success=success && (dev_cell_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); success=success && (dev_particle_id.alloc(_max_atoms,*dev)==UCL_SUCCESS); - _gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); + gpu_bytes+=dev_cell_id.row_bytes()+dev_particle_id.row_bytes(); if (_bonds) { success=success && (dev_tag.alloc(_max_atoms,*dev)==UCL_SUCCESS); - _gpu_bytes+=dev_tag.row_bytes(); + gpu_bytes+=dev_tag.row_bytes(); } } - _gpu_bytes+=dev_x.row_bytes()+dev_engv.row_bytes()+dev_ans.row_bytes(); + gpu_bytes+=dev_x.row_bytes(); + if (gpu_bytes>_max_gpu_bytes) + _max_gpu_bytes=gpu_bytes; _allocated=true; return success; } template -bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, - const bool rot, UCL_Device &devi, const bool gpu_nbor, +bool PairGPUAtomT::add_fields(const bool charge, const bool rot, + const bool gpu_nbor, const bool bonds) { + bool realloc=false; + if (charge && _charge==false) { + _charge=true; + realloc=true; + } + if (rot && _rot==false) { + _rot=true; + realloc=true; + } + if (gpu_nbor && _gpu_nbor==false) { + _gpu_nbor=true; + realloc=true; + } + if (bonds && _bonds==false) { + _bonds=true; + realloc=true; + } + if (realloc) { + _other=_charge || _rot; + int max_atoms=_max_atoms; + clear_resize(); + return alloc(max_atoms); + } + return true; +} + +template +bool PairGPUAtomT::init(const int nall, const bool charge, const bool rot, + UCL_Device &devi, const bool gpu_nbor, const bool bonds) { clear(); 
bool success=true; + _x_avail=false; + _q_avail=false; + _quat_avail=false; + _resized=false; _gpu_nbor=gpu_nbor; _bonds=bonds; _charge=charge; @@ -190,33 +208,25 @@ bool PairGPUAtomT::init(const int inum, const int nall, const bool charge, _other=_charge || _rot; dev=&devi; - _e_fields=1; - if (_charge) - _e_fields++; - _ev_fields=6+_e_fields; - // Initialize atom and nbor data - int ef_inum=inum; - if (ef_inum==0) - ef_inum=1000; int ef_nall=nall; - if (ef_nall<=ef_inum) - ef_nall=ef_inum*2; + if (ef_nall==0) + ef_nall=2000; // Initialize timers for the selected device time_pos.init(*dev); - time_other.init(*dev); - time_answer.init(*dev); + time_q.init(*dev); + time_quat.init(*dev); time_pos.zero(); - time_other.zero(); - time_answer.zero(); + time_q.zero(); + time_quat.zero(); _time_cast=0.0; #ifdef GPU_CAST compile_kernels(*dev); #endif - return success && alloc(ef_inum,ef_nall); + return success && alloc(ef_nall); } template @@ -234,16 +244,12 @@ void PairGPUAtomT::clear_resize() { dev_quat.clear(); host_quat.clear(); } - dev_ans.clear(); - dev_engv.clear(); #ifndef GPU_CAST host_x.clear(); #else host_x_cast.clear(); host_type_cast.clear(); #endif - host_ans.clear(); - host_engv.clear(); dev_cell_id.clear(); dev_particle_id.clear(); dev_tag.clear(); @@ -261,17 +267,14 @@ void PairGPUAtomT::clear_resize() { template void PairGPUAtomT::clear() { - _gpu_bytes=0; + _max_gpu_bytes=0; if (!_allocated) return; time_pos.clear(); - time_other.clear(); - time_answer.clear(); + time_q.clear(); + time_quat.clear(); clear_resize(); - _inum=0; - _eflag=false; - _vflag=false; #ifdef GPU_CAST if (_compiled) { @@ -289,255 +292,10 @@ double PairGPUAtomT::host_memory_usage() const { atom_bytes+=1; if (_rot) atom_bytes+=4; - int ans_bytes=atom_bytes+_ev_fields; return _max_atoms*atom_bytes*sizeof(numtyp)+ - ans_bytes*(_max_local)*sizeof(acctyp)+ sizeof(PairGPUAtom); } -template -void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool 
vf_atom) { - time_answer.start(); - _eflag=eflag; - _vflag=vflag; - _ef_atom=ef_atom; - _vf_atom=vf_atom; - - int csize=_ev_fields; - if (!eflag) - csize-=_e_fields; - if (!vflag) - csize-=6; - - if (csize>0) - ucl_copy(host_engv,dev_engv,_inum*csize,true); - if (_rot) - ucl_copy(host_ans,dev_ans,_inum*4*2,true); - else - ucl_copy(host_ans,dev_ans,_inum*4,true); - time_answer.stop(); -} - -template -void PairGPUAtomT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom, - int *ilist) { - _ilist=ilist; - copy_answers(eflag,vflag,ef_atom,vf_atom); -} - -template -double PairGPUAtomT::energy_virial(double *eatom, double **vatom, - double *virial) { - if (_eflag==false && _vflag==false) - return 0.0; - - double evdwl=0.0; - if (_gpu_nbor) { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[i][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } else { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - int ii=_ilist[i]; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[ii][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } - - evdwl*=0.5; - return evdwl; -} - -template -double PairGPUAtomT::energy_virial(double *eatom, double **vatom, - double *virial, double &ecoul) { - if (_eflag==false && _vflag==false) { - ecoul=0.0; - return 0.0; - } - - if (_charge==false) - return energy_virial(eatom,vatom,virial); 
- - double evdwl=0.0; - double _ecoul=0.0; - if (_gpu_nbor) { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; - _ecoul+=*ap; - eatom[i]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - _ecoul+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[i][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } else { - for (int i=0; i<_inum; i++) { - acctyp *ap=host_engv.begin()+i; - int ii=_ilist[i]; - if (_eflag) { - if (_ef_atom) { - evdwl+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; - _ecoul+=*ap; - eatom[ii]+=*ap*0.5; - ap+=_inum; - } else { - evdwl+=*ap; - ap+=_inum; - _ecoul+=*ap; - ap+=_inum; - } - } - if (_vflag) { - if (_vf_atom) { - for (int j=0; j<6; j++) { - vatom[ii][j]+=*ap*0.5; - virial[j]+=*ap; - ap+=_inum; - } - } else { - for (int j=0; j<6; j++) { - virial[j]+=*ap; - ap+=_inum; - } - } - } - } - for (int j=0; j<6; j++) - virial[j]*=0.5; - } - - evdwl*=0.5; - ecoul+=_ecoul*0.5; - return evdwl; -} - -template -void PairGPUAtomT::get_answers(double **f, double **tor) { - acctyp *ap=host_ans.begin(); - if (_gpu_nbor) { - for (int i=0; i<_inum; i++) { - f[i][0]+=*ap; - ap++; - f[i][1]+=*ap; - ap++; - f[i][2]+=*ap; - ap+=2; - } - if (_rot) { - for (int i=0; i<_inum; i++) { - tor[i][0]+=*ap; - ap++; - tor[i][1]+=*ap; - ap++; - tor[i][2]+=*ap; - ap+=2; - } - } - } else { - for (int i=0; i<_inum; i++) { - int ii=_ilist[i]; - f[ii][0]+=*ap; - ap++; - f[ii][1]+=*ap; - ap++; - f[ii][2]+=*ap; - ap+=2; - } - if (_rot) { - for (int i=0; i<_inum; i++) { - int ii=_ilist[i]; - tor[ii][0]+=*ap; - ap++; - tor[ii][1]+=*ap; - ap++; - tor[ii][2]+=*ap; - ap+=2; - } - } - } -} - // Sort arrays for neighbor list calculation template void PairGPUAtomT::sort_neighbor(const int num_atoms) { diff --git 
a/lib/gpu/pair_gpu_atom.h b/lib/gpu/pair_gpu_atom.h index e0a1fd9fb1..526c146f37 100644 --- a/lib/gpu/pair_gpu_atom.h +++ b/lib/gpu/pair_gpu_atom.h @@ -23,7 +23,6 @@ #ifdef USE_OPENCL -#include "geryon/ocl_device.h" #include "geryon/ocl_timer.h" #include "geryon/ocl_mat.h" #include "geryon/ocl_kernel.h" @@ -32,7 +31,6 @@ using namespace ucl_opencl; #else #include "cudpp.h" -#include "geryon/nvd_device.h" #include "geryon/nvd_timer.h" #include "geryon/nvd_mat.h" #include "geryon/nvd_kernel.h" @@ -40,10 +38,6 @@ using namespace ucl_cudadr; #endif -#ifndef int2 -struct int2 { int x; int y; }; -#endif - #include "pair_gpu_precision.h" template @@ -56,13 +50,9 @@ class PairGPUAtom { inline int max_atoms() const { return _max_atoms; } /// Current number of local+ghost atoms stored inline int nall() const { return _nall; } - /// Current number of local atoms stored - inline int inum() const { return _inum; } /// Set number of local+ghost atoms for future copy operations inline void nall(const int n) { _nall=n; } - /// Set number of local atoms for future copy operations - inline void inum(const int n) { _inum=n; } /// Memory usage per atom in this class int bytes_per_atom() const; @@ -70,21 +60,33 @@ class PairGPUAtom { /// Clear any previous data and set up for a new LAMMPS run /** \param rot True if atom storage needs quaternions * \param gpu_nbor True if neighboring will be performed on device **/ - bool init(const int inum, const int nall, const bool charge, const bool rot, + bool init(const int nall, const bool charge, const bool rot, UCL_Device &dev, const bool gpu_nbor=false, const bool bonds=false); /// Check if we have enough device storage and realloc if not - inline bool resize(const int inum, const int nall, bool &success) { - _inum=inum; + /** Returns true if resized with any call during this timestep **/ + inline bool resize(const int nall, bool &success) { _nall=nall; - if (inum>_max_local || nall>_max_atoms) { + if (nall>_max_atoms) { clear_resize(); - 
success = success && alloc(inum,nall); - return true; + success = success && alloc(nall); + _resized=true; } - return false; + return _resized; } - + + /// If already initialized by another LAMMPS style, add fields as necessary + /** \param rot True if atom storage needs quaternions + * \param gpu_nbor True if neighboring will be performed on device **/ + bool add_fields(const bool charge, const bool rot, const bool gpu_nbor, + const bool bonds); + + /// Returns true if GPU is using charges + bool charge() { return _charge; } + + /// Returns true if GPU is using quaternions + bool quat() { return _rot; } + /// Only free matrices of length inum or nall for resizing void clear_resize(); @@ -100,28 +102,42 @@ class PairGPUAtom { /// Add copy times to timers inline void acc_timers() { time_pos.add_to_total(); - time_answer.add_to_total(); - if (_other) - time_other.add_to_total(); + if (_charge) + time_q.add_to_total(); + if (_rot) + time_quat.add_to_total(); } /// Add copy times to timers inline void zero_timers() { time_pos.zero(); - time_answer.zero(); - if (_other) - time_other.zero(); + if (_charge) + time_q.zero(); + if (_rot) + time_quat.zero(); } /// Return the total time for host/device data transfer + /** Zeros the total so that the atom times are only included once **/ inline double transfer_time() { - double total=time_pos.total_seconds()+time_answer.total_seconds(); - if (_other) total+=time_other.total_seconds(); + double total=time_pos.total_seconds(); + time_pos.zero_total(); + if (_charge) { + total+=time_q.total_seconds(); + time_q.zero_total(); + } + if (_rot) { + total+=time_q.total_seconds(); + time_quat.zero_total(); + } + return total; } /// Return the total time for data cast/pack - inline double cast_time() { return _time_cast; } + /** Zeros the time so that atom times are only included once **/ + inline double cast_time() + { double t=_time_cast; _time_cast=0.0; return t; } /// Pack LAMMPS atom type constants into matrix and copy to device 
template @@ -216,43 +232,52 @@ class PairGPUAtom { // -------------------------COPY TO GPU ---------------------------------- + /// Signal that we need to transfer atom data for next timestep + inline void data_unavail() + { _x_avail=false; _q_avail=false; _quat_avail=false; _resized=false; } + /// Cast positions and types to write buffer inline void cast_x_data(double **host_ptr, const int *host_type) { - double t=MPI_Wtime(); - #ifdef GPU_CAST - memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); - memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); - #else - numtyp *_write_loc=host_x.begin(); - for (int i=0; i<_nall; i++) { - *_write_loc=host_ptr[i][0]; - _write_loc++; - *_write_loc=host_ptr[i][1]; - _write_loc++; - *_write_loc=host_ptr[i][2]; - _write_loc++; - *_write_loc=host_type[i]; - _write_loc++; + if (_x_avail==false) { + double t=MPI_Wtime(); + #ifdef GPU_CAST + memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); + memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); + #else + numtyp *_write_loc=host_x.begin(); + for (int i=0; i<_nall; i++) { + *_write_loc=host_ptr[i][0]; + _write_loc++; + *_write_loc=host_ptr[i][1]; + _write_loc++; + *_write_loc=host_ptr[i][2]; + _write_loc++; + *_write_loc=host_type[i]; + _write_loc++; + } + #endif + _time_cast+=MPI_Wtime()-t; } - #endif - _time_cast+=MPI_Wtime()-t; - } + } /// Copy positions and types to device asynchronously /** Copies nall() elements **/ inline void add_x_data(double **host_ptr, int *host_type) { time_pos.start(); - #ifdef GPU_CAST - ucl_copy(dev_x_cast,host_x_cast,_nall*3,true); - ucl_copy(dev_type_cast,host_type_cast,_nall,true); - int block_size=64; - int GX=static_cast(ceil(static_cast(_nall)/block_size)); - k_cast_x.set_size(GX,block_size); - k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), - &_nall); - #else - ucl_copy(dev_x,host_x,_nall*4,true); - #endif + if (_x_avail==false) { + #ifdef GPU_CAST + 
ucl_copy(dev_x_cast,host_x_cast,_nall*3,true); + ucl_copy(dev_type_cast,host_type_cast,_nall,true); + int block_size=64; + int GX=static_cast(ceil(static_cast(_nall)/block_size)); + k_cast_x.set_size(GX,block_size); + k_cast_x.run(&dev_x.begin(), &dev_x_cast.begin(), &dev_type_cast.begin(), + &_nall); + #else + ucl_copy(dev_x,host_x,_nall*4,true); + #endif + _x_avail=true; + } time_pos.stop(); } @@ -262,87 +287,68 @@ class PairGPUAtom { add_x_data(host_ptr,host_type); } - /// Cast charges to write buffer + // Cast charges to write buffer template inline void cast_q_data(cpytyp *host_ptr) { - double t=MPI_Wtime(); - if (dev->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_q.view((numtyp*)host_ptr,_nall,*dev); - dev_q.view(host_q); - } else - for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; - } else { - if (sizeof(numtyp)==sizeof(double)) - memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp)); - else - for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + if (_q_avail==false) { + double t=MPI_Wtime(); + if (dev->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_q.view((numtyp*)host_ptr,_nall,*dev); + dev_q.view(host_q); + } else + for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_q.begin(),host_ptr,_nall*sizeof(numtyp)); + else + for (int i=0; i<_nall; i++) host_q[i]=host_ptr[i]; + } + _time_cast+=MPI_Wtime()-t; } - _time_cast+=MPI_Wtime()-t; } - /// Copy charges to device asynchronously + // Copy charges to device asynchronously inline void add_q_data() { - ucl_copy(dev_q,host_q,_nall,true); + if (_q_avail==false) { + ucl_copy(dev_q,host_q,_nall,true); + _q_avail=true; + } } - /// Cast quaternions to write buffer + // Cast quaternions to write buffer template inline void cast_quat_data(cpytyp *host_ptr) { - double t=MPI_Wtime(); - if (dev->device_type()==UCL_CPU) { - if (sizeof(numtyp)==sizeof(double)) { - host_quat.view((numtyp*)host_ptr,_nall*4,*dev); - 
dev_quat.view(host_quat); - } else - for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; - } else { - if (sizeof(numtyp)==sizeof(double)) - memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp)); - else - for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + if (_quat_avail==false) { + double t=MPI_Wtime(); + if (dev->device_type()==UCL_CPU) { + if (sizeof(numtyp)==sizeof(double)) { + host_quat.view((numtyp*)host_ptr,_nall*4,*dev); + dev_quat.view(host_quat); + } else + for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + } else { + if (sizeof(numtyp)==sizeof(double)) + memcpy(host_quat.begin(),host_ptr,_nall*4*sizeof(numtyp)); + else + for (int i=0; i<_nall*4; i++) host_quat[i]=host_ptr[i]; + } + _time_cast+=MPI_Wtime()-t; } - _time_cast+=MPI_Wtime()-t; } - /// Copy quaternions to device + // Copy quaternions to device /** Copies nall()*4 elements **/ inline void add_quat_data() { - ucl_copy(dev_quat,host_quat,_nall*4,true); + if (_quat_avail==false) { + ucl_copy(dev_quat,host_quat,_nall*4,true); + _quat_avail=true; + } } - /// Copy data other than pos and data to device - inline void add_other_data() { - time_other.start(); - if (_charge) - add_q_data(); - if (_rot) - add_quat_data(); - time_other.stop(); - } - /// Return number of bytes used on device - inline double gpu_bytes() { return _gpu_bytes; } - - // -------------------------COPY FROM GPU ------------------------------- - - /// Copy answers from device into read buffer asynchronously - void copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom); - - /// Copy answers from device into read buffer asynchronously - void copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom, int *ilist); - - /// Copy energy and virial data into LAMMPS memory - double energy_virial(double *eatom, double **vatom, double *virial); - - /// Copy energy and virial data into LAMMPS memory - double energy_virial(double *eatom, double **vatom, 
double *virial, - double &ecoul); - - /// Add forces and torques from the GPU into a LAMMPS pointer - void get_answers(double **f, double **tor); + inline double max_gpu_bytes() + { double m=_max_gpu_bytes; _max_gpu_bytes=0.0; return m; } // ------------------------------ DATA ---------------------------------- @@ -352,10 +358,6 @@ class PairGPUAtom { UCL_D_Vec dev_q; /// Quaterions UCL_D_Vec dev_quat; - /// Force and possibly torque - UCL_D_Vec dev_ans; - /// Energy and virial per-atom storage - UCL_D_Vec dev_engv; #ifdef GPU_CAST UCL_D_Vec dev_x_cast; @@ -370,10 +372,6 @@ class PairGPUAtom { UCL_H_Vec host_q; /// Buffer for moving quat data to GPU UCL_H_Vec host_quat; - /// Force and possibly torque data on host - UCL_H_Vec host_ans; - /// Energy/virial data on host - UCL_H_Vec host_engv; /// Cell list identifiers for device nbor builds UCL_D_Vec dev_cell_id; @@ -383,7 +381,7 @@ class PairGPUAtom { UCL_D_Vec dev_tag; /// Device timers - UCL_Timer time_pos, time_other, time_answer; + UCL_Timer time_pos, time_q, time_quat; /// Geryon device UCL_Device *dev; @@ -396,19 +394,19 @@ class PairGPUAtom { #endif bool _compiled; - - bool alloc(const int inum, const int nall); - bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; - int _max_local, _max_atoms, _nall, _inum, _e_fields, _ev_fields; + // True if data has been copied to device already + bool _x_avail, _q_avail, _quat_avail, _resized; + + bool alloc(const int nall); + + bool _allocated, _rot, _charge, _other; + int _max_atoms, _nall; bool _gpu_nbor, _bonds; - int *_ilist; double _time_cast; - double _gpu_bytes; + double _max_gpu_bytes; - bool _newton; - #ifndef USE_OPENCL CUDPPConfiguration sort_config; CUDPPHandle sort_plan; diff --git a/lib/gpu/pair_gpu_balance.h b/lib/gpu/pair_gpu_balance.h index a3a0f61a62..9e14ad60d8 100644 --- a/lib/gpu/pair_gpu_balance.h +++ b/lib/gpu/pair_gpu_balance.h @@ -23,7 +23,7 @@ #define _HD_BALANCE_EVERY 25 #define _HD_BALANCE_WEIGHT 0.5 -#define 
_HD_BALANCE_GAP 1.05 +#define _HD_BALANCE_GAP 1.10 /// Host/device load balancer template @@ -33,7 +33,8 @@ class PairGPUBalance { inline ~PairGPUBalance() { clear(); } /// Clear any old data and setup for new LAMMPS run - inline void init(PairGPUDevice *gpu, const double split); + inline void init(PairGPUDevice *gpu, const bool gpu_nbor, + const double split); /// Clear all host and device data inline void clear() { @@ -43,23 +44,25 @@ class PairGPUBalance { _init_done=false; } } + + /// Return the timestep since initialization + inline int timestep() { return _timestep; } /// Get a count of the number of particles host will handle for initial alloc - inline int first_host_count(const int nlocal,const bool gpu_nbor, - const double gpu_split) const { + inline int first_host_count(const int nlocal, const double gpu_split, + const bool gpu_nbor) const { int host_nlocal=0; if (gpu_nbor && gpu_split!=1.0) { if (gpu_split>0) host_nlocal=static_cast(ceil((1.0-gpu_split)*nlocal)); else - host_nlocal=static_cast(ceil(0.1*nlocal)); + host_nlocal=static_cast(ceil(0.05*nlocal)); } return host_nlocal; } /// Return the number of particles the device will handle this timestep - inline int get_gpu_count(const int timestep, const int ago, - const int inum_full); + inline int get_gpu_count(const int ago, const int inum_full); /// Return the average fraction of particles handled by device on all procs inline double all_avg_split() { @@ -82,10 +85,10 @@ class PairGPUBalance { if (_measure_this_step) { _device->gpu->sync(); _device->gpu_barrier(); + _device->start_host_timer(); _device_time.start(); _device->gpu->sync(); _device->gpu_barrier(); - _device->start_host_timer(); } } @@ -95,34 +98,34 @@ class PairGPUBalance { /// Calculate the new host/device split based on the cpu and device times /** \note Only does calculation every _HD_BALANCE_EVERY timesteps (and first 10) **/ - inline void balance(const double cpu_time, const bool gpu_nbor); + inline void balance(const double 
cpu_time); /// Calls balance() and then get_gpu_count() - inline int balance(const int timestep, const int ago, const int inum_full, - const double cpu_time, const bool gpu_nbor) { - balance(cpu_time,gpu_nbor); - return get_gpu_count(timestep,ago,inum_full); + inline int balance(const int ago,const int inum_full,const double cpu_time) { + balance(cpu_time); + return get_gpu_count(ago,inum_full); } private: PairGPUDevice *_device; UCL_Timer _device_time; - bool _init_done; + bool _init_done, _gpu_nbor; bool _load_balance; double _actual_split, _avg_split, _desired_split, _max_split; int _avg_count; bool _measure_this_step; - int _inum, _inum_full; + int _inum, _inum_full, _timestep; }; #define PairGPUBalanceT PairGPUBalance template -void PairGPUBalanceT::init(PairGPUDevice *gpu, - const double split) { +void PairGPUBalanceT::init(PairGPUDevice *gpu, + const bool gpu_nbor, const double split) { clear(); + _gpu_nbor=gpu_nbor; _init_done=true; _device=gpu; @@ -130,7 +133,7 @@ void PairGPUBalanceT::init(PairGPUDevice *gpu, if (split<0.0) { _load_balance=true; - _desired_split=0.9; + _desired_split=0.90; } else { _load_balance=false; _desired_split=split; @@ -138,14 +141,14 @@ void PairGPUBalanceT::init(PairGPUDevice *gpu, _actual_split=_desired_split; _avg_split=0.0; _avg_count=0; + _timestep=0; } template -int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago, - const int inum_full) { +int PairGPUBalanceT::get_gpu_count(const int ago, const int inum_full) { _measure_this_step=false; if (_load_balance) { - if (_avg_count<11 || timestep%_HD_BALANCE_EVERY==0) { + if (_avg_count<11 || _timestep%_HD_BALANCE_EVERY==0) { _measure_this_step=true; _inum_full=inum_full; } @@ -156,44 +159,44 @@ int PairGPUBalanceT::get_gpu_count(const int timestep, const int ago, } _inum=static_cast(floor(_actual_split*inum_full)); if (_inum==0) _inum++; + _timestep++; return _inum; } template -void PairGPUBalanceT::balance(const double cpu_time, const bool gpu_nbor) { +void 
PairGPUBalanceT::balance(const double cpu_time) { if (_measure_this_step) { + _measure_this_step=false; + double gpu_time=_device_time.seconds(); + + double max_gpu_time; + MPI_Allreduce(&gpu_time,&max_gpu_time,1,MPI_DOUBLE,MPI_MAX, + _device->gpu_comm()); + if (_inum_full==_inum) { _desired_split=1.0; return; } - _measure_this_step=false; - double gpu_time=_device_time.seconds(); + double cpu_time_per_atom=cpu_time/(_inum_full-_inum); + double cpu_other_time=_device->host_time()-cpu_time; + int host_inum=static_cast((max_gpu_time-cpu_other_time)/ + cpu_time_per_atom); - double cpu_gpu_time[3], max_times[3]; - cpu_gpu_time[0]=cpu_time/(_inum_full-_inum); - cpu_gpu_time[1]=gpu_time/_inum; - cpu_gpu_time[2]=(_device->host_time()-cpu_time)/_inum_full; + double split=static_cast(_inum_full-host_inum)/_inum_full; + _desired_split=split*_HD_BALANCE_GAP; + if (_desired_split>1.0) + _desired_split=1.0; + if (_desired_split<0.0) + _desired_split=0.0; - MPI_Allreduce(cpu_gpu_time,max_times,3,MPI_DOUBLE,MPI_MAX, - _device->gpu_comm()); - double split=(max_times[0]+max_times[2])/(max_times[0]+max_times[1]); - split*=_HD_BALANCE_GAP; - - if (split>1.0) - split=1.0; - if (_avg_count<10) - _desired_split=(_desired_split*_avg_count+split)/(_avg_count+1); - else - _desired_split=_desired_split*(1.0-_HD_BALANCE_WEIGHT)+ - _HD_BALANCE_WEIGHT*split; - - if (!gpu_nbor) { + if (!_gpu_nbor) { if (_desired_split<_max_split) _actual_split=_desired_split; else _actual_split=_max_split; } +//std::cout << gpu_time << " " << max_gpu_time << " " << cpu_other_time << " " << cpu_time_per_atom << " " << cpu_time << " " << _desired_split << " " << host_inum << std::endl; } _avg_split+=_desired_split; _avg_count++; diff --git a/lib/gpu/pair_gpu_build_kernel.cu b/lib/gpu/pair_gpu_build_kernel.cu index bcf41c0050..33742a4cba 100644 --- a/lib/gpu/pair_gpu_build_kernel.cu +++ b/lib/gpu/pair_gpu_build_kernel.cu @@ -18,7 +18,7 @@ #ifdef NV_KERNEL -#include "geryon/ucl_nv_kernel.h" +#include 
"nv_kernel_def.h" texture neigh_tex; #ifdef _DOUBLE_DOUBLE @@ -36,6 +36,7 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #else #define fetch_pos(i,y) x_[i] +#define BLOCK_NBOR_BUILD 64 #endif @@ -54,29 +55,30 @@ __inline float4 fetch_pos(const int& i, const float4 *pos) #define numtyp4 float4 #endif -#define CELL_BLOCK_SIZE 64 -#define BLOCK_2D 8 +#define BLOCK_CELL_2D 8 + +#define SBBITS 30 #define SBBITS 30 __kernel void transpose(int *out, int *in, int columns_in, int rows_in) { - __local float block[BLOCK_2D][BLOCK_2D+1]; + __local float block[BLOCK_CELL_2D][BLOCK_CELL_2D+1]; unsigned ti=THREAD_ID_X; unsigned tj=THREAD_ID_Y; unsigned bi=BLOCK_ID_X; unsigned bj=BLOCK_ID_Y; - unsigned i=bi*BLOCK_2D+ti; - unsigned j=bj*BLOCK_2D+tj; + unsigned i=bi*BLOCK_CELL_2D+ti; + unsigned j=bj*BLOCK_CELL_2D+tj; if ((ipid_i) { - diff.x = atom_i.x - pos_sh[j].x; - diff.y = atom_i.y - pos_sh[j].y; - diff.z = atom_i.z - pos_sh[j].z; + diff.x = atom_i.x - pos_sh[j].x; + diff.y = atom_i.y - pos_sh[j].y; + diff.z = atom_i.z - pos_sh[j].z; - r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z; - if (r2 < cell_size*cell_size && r2 > 1e-5) { - if (cnt < neigh_bin_size) { - *neigh_list = pid_j; - neigh_list+=stride; - } - cnt++; - } - } + r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z; + if (r2 < cell_size*cell_size && r2 > 1e-5) { + if (cnt < neigh_bin_size) { + *neigh_list = pid_j; + neigh_list+=stride; + } + cnt++; + } } } __syncthreads(); @@ -249,9 +250,10 @@ __kernel void calc_neigh_list_cell(numtyp4 *pos, } __kernel void kernel_special(__global int *dev_nbor, - __global int *host_nbor_list, __global int *tag, + __global int *host_nbor_list, + __global int *host_numj, __global int *tag, __global int *nspecial, __global int *special, - int inum, int nt, int nall) { + int inum, int nt, int nall, int max_nbors) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; @@ -263,15 +265,17 @@ __kernel void kernel_special(__global int *dev_nbor, int 
n2=nspecial[ii*3+1]; int n3=nspecial[ii*3+2]; + int numj; if (ii < inum) { stride=inum; list=dev_nbor+stride+ii; + numj=*list; + list+=stride; } else { - stride=nt-inum; - list=host_nbor_list+ii-inum; + stride=1; + list=host_nbor_list+(ii-inum)*max_nbors; + numj=host_numj[ii-inum]; } - int numj=*list; - list+=stride; list_end=list+numj*stride; for ( ; list #include +#ifdef _OPENMP +#include +#endif + +#ifdef USE_OPENCL +#include "pair_gpu_dev_cl.h" +#else +#include "pair_gpu_dev_ptx.h" +#endif #define PairGPUDeviceT PairGPUDevice template PairGPUDeviceT::PairGPUDevice() : _init_count(0), _device_init(false), _gpu_mode(GPU_FORCE), _first_device(0), - _last_device(0) { + _last_device(0), _compiled(false) { } template @@ -34,14 +43,19 @@ PairGPUDeviceT::~PairGPUDevice() { } template -bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, - const int first_gpu, const int last_gpu, - const int gpu_mode, const double p_split, - const int nthreads) { +int PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, + const int first_gpu, const int last_gpu, + const int gpu_mode, const double p_split, + const int nthreads, const int t_per_atom) { _nthreads=nthreads; + #ifdef _OPENMP + omp_set_num_threads(nthreads); + #endif + _threads_per_atom=t_per_atom; + _threads_per_charge=t_per_atom; if (_device_init) - return true; + return 0; _device_init=true; _comm_world=world; _comm_replica=replica; @@ -96,7 +110,12 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, // set the device ID _procs_per_gpu=static_cast(ceil(static_cast(procs_per_node)/ (last_gpu-first_gpu+1))); - int my_gpu=node_rank/_procs_per_gpu; + int my_gpu=node_rank/_procs_per_gpu+first_gpu; + + // Time on the device only if 1 proc per gpu + _time_device=true; + if (_procs_per_gpu>1) + _time_device=false; // Set up a per device communicator MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); @@ -104,39 +123,109 @@ bool PairGPUDeviceT::init_device(MPI_Comm world, MPI_Comm replica, 
gpu=new UCL_Device(); if (my_gpu>=gpu->num_devices()) - return false; + return -2; gpu->set(my_gpu); - return true; + + _long_range_precompute=0; + + int flag=compile_kernels(); + + return flag; } template -bool PairGPUDeviceT::init(const bool charge, const bool rot, const int nlocal, - const int host_nlocal, const int nall, - const int maxspecial, const bool gpu_nbor, - const int gpu_host, const int max_nbors, - const double cell_size, const bool pre_cut) { +int PairGPUDeviceT::init(PairGPUAns &ans, const bool charge, + const bool rot, const int nlocal, + const int host_nlocal, const int nall, + PairGPUNbor *nbor, const int maxspecial, + const int gpu_host, const int max_nbors, + const double cell_size, const bool pre_cut) { if (!_device_init) - return false; + return -1; + if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) + return -5; + + // Counts of data transfers for timing overhead estimates + _data_in_estimate=0; + _data_out_estimate=1; + + // Initial number of local particles + int ef_nlocal=nlocal; + if (_particle_split<1.0 && _particle_split>0.0) + ef_nlocal=static_cast(_particle_split*nlocal); + + bool gpu_nbor=false; + if (_gpu_mode==GPU_NEIGH) + gpu_nbor=true; + if (_init_count==0) { // Initialize atom and nbor data - int ef_nlocal=nlocal; - if (_particle_split<1.0 && _particle_split>0.0) - ef_nlocal=static_cast(_particle_split*nlocal); - if (!atom.init(ef_nlocal,nall,charge,rot,*gpu,gpu_nbor, - gpu_nbor && maxspecial>0)) - return false; - if (!nbor.init(ef_nlocal,host_nlocal,max_nbors,maxspecial,*gpu,gpu_nbor, - gpu_host,pre_cut)) - return false; - nbor.cell_size(cell_size); + if (!atom.init(nall,charge,rot,*gpu,gpu_nbor,gpu_nbor && maxspecial>0)) + return -3; + + _data_in_estimate++; + if (charge) + _data_in_estimate++; + if (rot) + _data_in_estimate++; } else { - if (cell_size>nbor.cell_size()) - nbor.cell_size(cell_size); + if (atom.charge()==false && charge) + _data_in_estimate++; + if (atom.quat()==false && rot) + 
_data_in_estimate++; + if (!atom.add_fields(charge,rot,gpu_nbor,gpu_nbor && maxspecial)) + return -3; } + + if (!ans.init(ef_nlocal,charge,rot,*gpu)) + return -3; + + if (!nbor->init(&_nbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, + *gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d, + _block_cell_id, _block_nbor_build)) + return -3; + nbor->cell_size(cell_size); _init_count++; - return true; + return 0; +} + +template +int PairGPUDeviceT::init(PairGPUAns &ans, const int nlocal, + const int nall) { + if (!_device_init) + return -1; + if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false) + return -5; + + if (_init_count==0) { + // Initialize atom and nbor data + if (!atom.init(nall,true,false,*gpu,false,false)) + return -3; + } else + if (!atom.add_fields(true,false,false,false)) + return -3; + + if (!ans.init(nlocal,true,false,*gpu)) + return -3; + + _init_count++; + return 0; +} + +template +void PairGPUDeviceT::set_single_precompute + (PPPMGPUMemory *pppm) { + _long_range_precompute=1; + pppm_single=pppm; +} + +template +void PairGPUDeviceT::set_double_precompute + (PPPMGPUMemory *pppm) { + _long_range_precompute=2; + pppm_double=pppm; } template @@ -152,11 +241,17 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name, fprintf(screen,"\n-------------------------------------"); fprintf(screen,"-------------------------------------\n"); fprintf(screen,"- Using GPGPU acceleration for %s:\n",name); - fprintf(screen,"- with %d procs per device.\n",_procs_per_gpu); + fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); + #ifdef _OPENMP + fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); + #endif fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n"); - for (int i=first_gpu; i<=last_gpu; i++) { + int last=last_gpu+1; + if (last>gpu->num_devices()) + last=gpu->num_devices(); + for (int i=first_gpu; iname(i)+", "+toa(gpu->cores(i))+" cores, "+fs+ 
toa(gpu->gigabytes(i))+" GB, "+toa(gpu->clock_rate(i))+ " GHZ ("; @@ -177,32 +272,152 @@ void PairGPUDeviceT::init_message(FILE *screen, const char *name, } template -void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, - const double max_bytes, FILE *screen) { - double single[5], times[5]; +void PairGPUDeviceT::estimate_gpu_overhead(const int kernel_calls, + double &gpu_overhead, + double &gpu_driver_overhead) { + UCL_H_Vec *host_data_in=NULL, *host_data_out=NULL; + UCL_D_Vec *dev_data_in=NULL, *dev_data_out=NULL, *kernel_data=NULL; + UCL_Timer *timers_in=NULL, *timers_out=NULL, *timers_kernel=NULL; + UCL_Timer over_timer(*gpu); - single[0]=atom.transfer_time(); + if (_data_in_estimate>0) { + host_data_in=new UCL_H_Vec[_data_in_estimate]; + dev_data_in=new UCL_D_Vec[_data_in_estimate]; + timers_in=new UCL_Timer[_data_in_estimate]; + } + + if (_data_out_estimate>0) { + host_data_out=new UCL_H_Vec[_data_out_estimate]; + dev_data_out=new UCL_D_Vec[_data_out_estimate]; + timers_out=new UCL_Timer[_data_out_estimate]; + } + + if (kernel_calls>0) { + kernel_data=new UCL_D_Vec[kernel_calls]; + timers_kernel=new UCL_Timer[kernel_calls]; + } + + for (int i=0; i<_data_in_estimate; i++) { + host_data_in[i].alloc(1,*gpu); + dev_data_in[i].alloc(1,*gpu); + timers_in[i].init(*gpu); + } + + for (int i=0; i<_data_out_estimate; i++) { + host_data_out[i].alloc(1,*gpu); + dev_data_out[i].alloc(1,*gpu); + timers_out[i].init(*gpu); + } + + for (int i=0; isync(); + gpu_barrier(); + over_timer.start(); + gpu->sync(); + gpu_barrier(); + + double driver_time=MPI_Wtime(); + for (int i=0; i<_data_in_estimate; i++) { + timers_in[i].start(); + ucl_copy(dev_data_in[i],host_data_in[i],true); + timers_in[i].stop(); + } + + for (int i=0; i0) { + delete [] host_data_in; + delete [] dev_data_in; + delete [] timers_in; + } + + if (_data_out_estimate>0) { + delete [] host_data_out; + delete [] dev_data_out; + delete [] timers_out; + } + + if (kernel_calls>0) { + delete [] 
kernel_data; + delete [] timers_kernel; + } +} + +template +void PairGPUDeviceT::output_times(UCL_Timer &time_pair, + PairGPUAns &ans, + PairGPUNbor &nbor, const double avg_split, + const double max_bytes, + const double gpu_overhead, + const double driver_overhead, + const int threads_per_atom, FILE *screen) { + double single[8], times[8]; + + single[0]=atom.transfer_time()+ans.transfer_time(); single[1]=nbor.time_nbor.total_seconds(); single[2]=nbor.time_kernel.total_seconds(); single[3]=time_pair.total_seconds(); - single[4]=atom.cast_time(); + single[4]=atom.cast_time()+ans.cast_time(); + single[5]=gpu_overhead; + single[6]=driver_overhead; + single[7]=ans.cpu_idle_time(); - MPI_Reduce(single,times,5,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); - double my_max_bytes=max_bytes; + double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); if (replica_me()==0) - if (screen && times[3]>0.0) { + if (screen && times[5]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); fprintf(screen," GPU Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (procs_per_gpu()==1) { + if (time_device()) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); @@ -212,7 +427,71 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } + fprintf(screen,"GPU Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: 
%.4f.\n",avg_split); + fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); + fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); + + fprintf(screen,"-------------------------------------"); + fprintf(screen,"--------------------------------\n\n"); + } +} + +template +void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in, + UCL_Timer &time_out, + UCL_Timer &time_map, + UCL_Timer &time_rho, + UCL_Timer &time_interp, + PairGPUAns &ans, + const double max_bytes, + const double cpu_time, + const double idle_time, FILE *screen) { + double single[8], times[8]; + + single[0]=time_out.total_seconds(); + single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time(); + single[2]=time_map.total_seconds(); + single[3]=time_rho.total_seconds(); + single[4]=time_interp.total_seconds(); + single[5]=ans.transfer_time()+ans.cast_time(); + single[6]=cpu_time; + single[7]=idle_time; + + MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + + double my_max_bytes=max_bytes+atom.max_gpu_bytes(); + double mpi_max_bytes; + MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); + double max_mb=mpi_max_bytes/(1024.0*1024.0); + + if (replica_me()==0) + if (screen && times[6]>0.0) { + fprintf(screen,"\n\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + fprintf(screen," GPU Time Info (average): "); + fprintf(screen,"\n-------------------------------------"); + fprintf(screen,"--------------------------------\n"); + + if (time_device()) { + fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size); + fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size); + fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); + fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size); + fprintf(screen,"Force interp: %.4f 
s.\n",times[4]/_replica_size); + fprintf(screen,"Total rho: %.4f s.\n", + (times[0]+times[2]+times[3])/_replica_size); + fprintf(screen,"Total interp: %.4f s.\n", + (times[1]+times[4])/_replica_size); + fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size); + fprintf(screen,"Total: %.4f s.\n", + (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/ + _replica_size); + } + fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); fprintf(screen,"-------------------------------------"); @@ -223,10 +502,17 @@ void PairGPUDeviceT::output_times(UCL_Timer &time_pair, const double avg_split, template void PairGPUDeviceT::clear() { if (_init_count>0) { + _long_range_precompute=0; _init_count--; if (_init_count==0) { atom.clear(); - nbor.clear(); + _nbor_shared.clear(); + if (_compiled) { + k_zero.clear(); + k_info.clear(); + delete dev_program; + _compiled=false; + } } } } @@ -241,21 +527,80 @@ void PairGPUDeviceT::clear_device() { } } +template +int PairGPUDeviceT::compile_kernels() { + int flag=0; + + if (_compiled) + return flag; + + std::string flags="-cl-mad-enable"; + dev_program=new UCL_Program(*gpu); + int success=dev_program->load_string(pair_gpu_dev_kernel,flags.c_str()); + if (success!=UCL_SUCCESS) + return -4; + k_zero.set_function(*dev_program,"kernel_zero"); + k_info.set_function(*dev_program,"kernel_info"); + _compiled=true; + + UCL_H_Vec h_gpu_lib_data(14,*gpu,UCL_NOT_PINNED); + UCL_D_Vec d_gpu_lib_data(14,*gpu); + k_info.set_size(1,1); + k_info.run(&d_gpu_lib_data.begin()); + ucl_copy(h_gpu_lib_data,d_gpu_lib_data,false); + + #ifndef USE_OPENCL + if (static_cast(h_gpu_lib_data[0])/100.0>gpu->arch()) + return -4; + #endif + + _num_mem_threads=h_gpu_lib_data[1]; + _warp_size=h_gpu_lib_data[2]; + if (_threads_per_atom<1) + _threads_per_atom=h_gpu_lib_data[3]; + if (_threads_per_charge<1) + 
_threads_per_charge=h_gpu_lib_data[13]; + _pppm_max_spline=h_gpu_lib_data[4]; + _pppm_block=h_gpu_lib_data[5]; + _block_pair=h_gpu_lib_data[6]; + _max_shared_types=h_gpu_lib_data[7]; + _block_cell_2d=h_gpu_lib_data[8]; + _block_cell_id=h_gpu_lib_data[9]; + _block_nbor_build=h_gpu_lib_data[10]; + _block_bio_pair=h_gpu_lib_data[11]; + _max_bio_shared_types=h_gpu_lib_data[12]; + + if (static_cast(_block_pair)>gpu->group_size()) + _block_pair=gpu->group_size(); + if (static_cast(_block_bio_pair)>gpu->group_size()) + _block_bio_pair=gpu->group_size(); + if (_threads_per_atom>_warp_size) + _threads_per_atom=_warp_size; + if (_warp_size%_threads_per_atom!=0) + _threads_per_atom=1; + if (_threads_per_charge>_warp_size) + _threads_per_charge=_warp_size; + if (_warp_size%_threads_per_charge!=0) + _threads_per_charge=1; + + return flag; +} + template double PairGPUDeviceT::host_memory_usage() const { - return atom.host_memory_usage()+ - nbor.host_memory_usage()+4*sizeof(numtyp)+ + return atom.host_memory_usage()+4*sizeof(numtyp)+ sizeof(PairGPUDevice); } template class PairGPUDevice; PairGPUDevice pair_gpu_device; -bool lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, - const double particle_split, const int nthreads) { +int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + const int last_gpu, const int gpu_mode, + const double particle_split, const int nthreads, + const int t_per_atom) { return pair_gpu_device.init_device(world,replica,first_gpu,last_gpu,gpu_mode, - particle_split,nthreads); + particle_split,nthreads,t_per_atom); } void lmp_clear_device() { @@ -264,14 +609,5 @@ void lmp_clear_device() { double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul) { - if (pair_gpu_device.init_count()) { - pair_gpu_device.stop_host_timer(); - pair_gpu_device.gpu->sync(); - double 
evdw=pair_gpu_device.atom.energy_virial(eatom,vatom,virial,ecoul); - pair_gpu_device.atom.get_answers(f,tor); - - return evdw; - } - return 0.0; + return pair_gpu_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul); } - diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h index 33aa54959b..1e7e15e6a8 100644 --- a/lib/gpu/pair_gpu_device.h +++ b/lib/gpu/pair_gpu_device.h @@ -19,11 +19,17 @@ #define PAIR_GPU_DEVICE_H #include "pair_gpu_atom.h" +#include "pair_gpu_ans.h" #include "pair_gpu_nbor.h" +#include "pppm_gpu_memory.h" #include "mpi.h" #include #include "stdio.h" #include +#include + +template class PPPMGPUMemory; template class PairGPUDevice { @@ -33,10 +39,15 @@ class PairGPUDevice { /// Initialize the device for use by this process /** Sets up a per-device MPI communicator for load balancing and initializes - * the device (>=first_gpu and <=last_gpu) that this proc will be using **/ - bool init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, + * the device (>=first_gpu and <=last_gpu) that this proc will be using + * Returns: + * - 0 if successfull + * - -2 if GPU not found + * - -4 if GPU library not compiled for GPU **/ + int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, const int last_gpu, const int gpu_mode, - const double particle_split, const int nthreads); + const double particle_split, const int nthreads, + const int t_per_atom); /// Initialize the device for Atom and Neighbor storage /** \param rot True if quaternions need to be stored @@ -50,19 +61,67 @@ class PairGPUDevice { * \param max_nbors Initial number of rows in the neighbor matrix * \param cell_size cutoff+skin * \param pre_cut True if cutoff test will be performed in separate kernel - * than the force kernel **/ - bool init(const bool charge, const bool rot, const int nlocal, - const int host_nlocal, const int nall, const int maxspecial, - const bool gpu_nbor, const int gpu_host, const int max_nbors, - const double cell_size, const bool 
pre_cut); + * than the force kernel + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(PairGPUAns &a, const bool charge, const bool rot, + const int nlocal, const int host_nlocal, const int nall, + PairGPUNbor *nbor, const int maxspecial, const int gpu_host, + const int max_nbors, const double cell_size, const bool pre_cut); + + /// Initialize the device for Atom storage only + /** \param nlocal Total number of local particles to allocate memory for + * \param nall Total number of local+ghost particles + * + * Returns: + * - 0 if successfull + * - -1 if fix gpu not found + * - -3 if there is an out of memory error + * - -4 if the GPU library was not compiled for GPU + * - -5 Double precision is not supported on card **/ + int init(PairGPUAns &ans, const int nlocal, const int nall); /// Output a message for pair_style acceleration with device stats void init_message(FILE *screen, const char *name, const int first_gpu, const int last_gpu); + /// Perform charge assignment asynchronously for PPPM + void set_single_precompute(PPPMGPUMemory *pppm); + + /// Perform charge assignment asynchronously for PPPM + void set_double_precompute(PPPMGPUMemory *pppm); + + /// Esimate the overhead from GPU calls from multiple procs + /** \param kernel_calls Number of kernel calls/timestep for timing estimated + * overhead + * \param gpu_overhead Estimated gpu overhead per timestep (sec) + * \param driver_overhead Estimated overhead from driver per timestep (s) **/ + void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, + double &gpu_driver_overhead); + + /// Returns true if double precision is supported on card + inline bool double_precision() { return gpu->double_precision(); } + /// Output a message with timing information - void output_times(UCL_Timer &time_pair, const double avg_split, - 
const double max_bytes, FILE *screen); + void output_times(UCL_Timer &time_pair, PairGPUAns &ans, + PairGPUNbor &nbor, const double avg_split, + const double max_bytes, const double gpu_overhead, + const double driver_overhead, + const int threads_per_atom, FILE *screen); + + /// Output a message with timing information + void output_kspace_times(UCL_Timer &time_in, UCL_Timer &time_out, + UCL_Timer & time_map, UCL_Timer & time_rho, + UCL_Timer &time_interp, + PairGPUAns &ans, + const double max_bytes, const double cpu_time, + const double cpu_idle_time, FILE *screen); /// Clear all memory on host and device associated with atom and nbor data void clear(); @@ -70,11 +129,37 @@ class PairGPUDevice { /// Clear all memory on host and device void clear_device(); + /// Add an answer object for putting forces, energies, etc from GPU to LAMMPS + inline void add_ans_object(PairGPUAns *ans) + { ans_queue.push(ans); } + + /// Add "answers" (force,energies,etc.) into LAMMPS structures + inline double fix_gpu(double **f, double **tor, double *eatom, + double **vatom, double *virial, double &ecoul) { + atom.data_unavail(); + if (ans_queue.empty()==false) { + stop_host_timer(); + double evdw=0.0; + while (ans_queue.empty()==false) { + evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul); + ans_queue.pop(); + } + return evdw; + } + return 0.0; + } + /// Start timer on host - inline void start_host_timer() { _cpu_full=MPI_Wtime(); } + inline void start_host_timer() + { _cpu_full=MPI_Wtime(); _host_timer_started=true; } /// Stop timer on host - inline void stop_host_timer() { _cpu_full=MPI_Wtime()-_cpu_full; } + inline void stop_host_timer() { + if (_host_timer_started) { + _cpu_full=MPI_Wtime()-_cpu_full; + _host_timer_started=false; + } + } /// Return host time inline double host_time() { return _cpu_full; } @@ -114,6 +199,42 @@ class PairGPUDevice { inline double particle_split() const { return _particle_split; } /// Return the initialization count for the device 
inline int init_count() const { return _init_count; } + /// True if device is being timed + inline bool time_device() const { return _time_device; } + + /// Return the number of threads accessing memory simulatenously + inline int num_mem_threads() const { return _num_mem_threads; } + /// Return the number of threads per atom for pair styles + inline int threads_per_atom() const { return _threads_per_atom; } + /// Return the number of threads per atom for pair styles using charge + inline int threads_per_charge() const { return _threads_per_charge; } + /// Return the min of the pair block size or the device max block size + inline int pair_block_size() const { return _block_pair; } + /// Return the maximum number of atom types that can be used with shared mem + inline int max_shared_types() const { return _max_shared_types; } + /// Return the maximum order for PPPM splines + inline int pppm_max_spline() const { return _pppm_max_spline; } + /// Return the block size for PPPM kernels + inline int pppm_block() const { return _pppm_block; } + /// Return the block size for neighbor binning + inline int block_cell_2d() const { return _block_cell_2d; } + /// Return the block size for atom mapping for neighbor builds + inline int block_cell_id() const { return _block_cell_id; } + /// Return the block size for neighbor build kernel + inline int block_nbor_build() const { return _block_nbor_build; } + /// Return the block size for "bio" pair styles + inline int block_bio_pair() const { return _block_bio_pair; } + /// Return the maximum number of atom types for shared mem with "bio" styles + inline int max_bio_shared_types() const { return _max_bio_shared_types; } + + // -------------------- SHARED DEVICE ROUTINES -------------------- + // Perform asynchronous zero of integer array + void zero(UCL_D_Vec &mem, const int numel) { + int num_blocks=static_cast(ceil(static_cast(numel)/ + _block_pair)); + k_zero.set_size(num_blocks,_block_pair); + k_zero.run(&mem.begin(),&numel); + 
} // -------------------------- DEVICE DATA ------------------------- @@ -130,11 +251,30 @@ class PairGPUDevice { // --------------------------- NBOR DATA ---------------------------- /// Neighbor Data - PairGPUNbor nbor; + PairGPUNborShared _nbor_shared; + + // ------------------------ LONG RANGE DATA ------------------------- + + // Long Range Data + int _long_range_precompute; + PPPMGPUMemory *pppm_single; + PPPMGPUMemory *pppm_double; + /// Precomputations for long range charge assignment (asynchronously) + inline void precompute(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *charge, double *boxlo, double *prd) { + if (_long_range_precompute==1) + pppm_single->precompute(ago,nlocal,nall,host_x,host_type,success,charge, + boxlo,prd); + else if (_long_range_precompute==2) + pppm_double->precompute(ago,nlocal,nall,host_x,host_type,success,charge, + boxlo,prd); + } private: + std::queue *> ans_queue; int _init_count; - bool _device_init; + bool _device_init, _host_timer_started, _time_device; MPI_Comm _comm_world, _comm_replica, _comm_gpu; int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me, _replica_size; @@ -142,6 +282,19 @@ class PairGPUDevice { double _particle_split; double _cpu_full; + int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge; + int _pppm_max_spline, _pppm_block; + int _block_pair, _max_shared_types; + int _block_cell_2d, _block_cell_id, _block_nbor_build; + int _block_bio_pair, _max_bio_shared_types; + + UCL_Program *dev_program; + UCL_Kernel k_zero, k_info; + bool _compiled; + int compile_kernels(); + + int _data_in_estimate, _data_out_estimate; + template inline std::string toa(const t& in) { std::ostringstream o; diff --git a/lib/gpu/pair_gpu_nbor.cpp b/lib/gpu/pair_gpu_nbor.cpp index 123fbe54aa..df138a7eff 100644 --- a/lib/gpu/pair_gpu_nbor.cpp +++ b/lib/gpu/pair_gpu_nbor.cpp @@ -18,15 +18,9 @@ #include "pair_gpu_precision.h" #include 
"pair_gpu_nbor.h" +#include "pair_gpu_device.h" #include "math.h" -#ifdef USE_OPENCL -#include "pair_gpu_nbor_cl.h" -#else -#include "pair_gpu_nbor_ptx.h" -#include "pair_gpu_build_ptx.h" -#endif - int PairGPUNbor::bytes_per_atom(const int max_nbors) const { if (_gpu_nbor) return (max_nbors+2)*sizeof(int); @@ -36,12 +30,18 @@ int PairGPUNbor::bytes_per_atom(const int max_nbors) const { return (max_nbors+3)*sizeof(int); } -bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors, +bool PairGPUNbor::init(PairGPUNborShared *shared, const int inum, + const int host_inum, const int max_nbors, const int maxspecial, UCL_Device &devi, const bool gpu_nbor, const int gpu_host, - const bool pre_cut) { + const bool pre_cut, const int block_cell_2d, + const int block_cell_id, const int block_nbor_build) { clear(); + _block_cell_2d=block_cell_2d; + _block_cell_id=block_cell_id; + _block_nbor_build=block_nbor_build; + _shared=shared; dev=&devi; _gpu_nbor=gpu_nbor; if (gpu_host==0) @@ -80,8 +80,11 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors, success=success && (host_packed.alloc(2*IJ_SIZE,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); alloc(success); + if (!success) + return false; + if (_use_packing==false) - compile_kernels(devi); + _shared->compile_kernels(devi,gpu_nbor); return success; } @@ -89,13 +92,14 @@ bool PairGPUNbor::init(const int inum, const int host_inum, const int max_nbors, void PairGPUNbor::alloc(bool &success) { dev_nbor.clear(); host_acc.clear(); + int nt=_max_atoms+_max_host; if (_use_packing==false || _gpu_nbor) success=success && (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); else success=success && (dev_nbor.alloc(3*_max_atoms,*dev, UCL_READ_ONLY)==UCL_SUCCESS); - success=success && (host_acc.alloc((_max_atoms+_max_host)*2,*dev, + success=success && (host_acc.alloc(nt*2,*dev, UCL_WRITE_OPTIMIZED)==UCL_SUCCESS); _c_bytes=dev_nbor.row_bytes(); @@ -108,11 +112,31 @@ void 
PairGPUNbor::alloc(bool &success) { if (_max_host>0) { host_nbor.clear(); dev_host_nbor.clear(); - success=success && (host_nbor.alloc((_max_nbors+1)*_max_host,*dev, + dev_host_numj.clear(); + host_ilist.clear(); + host_jlist.clear(); + + success=success && (host_nbor.alloc(_max_nbors*_max_host,*dev, UCL_RW_OPTIMIZED)==UCL_SUCCESS); - success=success && (dev_host_nbor.alloc((_max_nbors+1)*_max_host, + success=success && (dev_host_nbor.alloc(_max_nbors*_max_host, *dev,UCL_WRITE_ONLY)==UCL_SUCCESS); - _c_bytes+=dev_host_nbor.row_bytes(); + success=success && (dev_host_numj.alloc(_max_host,*dev, + UCL_WRITE_ONLY)==UCL_SUCCESS); + success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS); + if (!success) + return; + for (int i=0; i0) { dev_nspecial.clear(); @@ -145,6 +169,9 @@ void PairGPUNbor::clear() { dev_host_nbor.clear(); dev_packed.clear(); host_nbor.clear(); + dev_host_numj.clear(); + host_ilist.clear(); + host_jlist.clear(); dev_nspecial.clear(); dev_special.clear(); dev_special_t.clear(); @@ -152,27 +179,13 @@ void PairGPUNbor::clear() { time_kernel.clear(); time_nbor.clear(); } - - if (_compiled) { - if (_gpu_nbor) { - k_cell_id.clear(); - k_cell_counts.clear(); - k_build_nbor.clear(); - k_transpose.clear(); - k_special.clear(); - delete build_program; - } else { - k_nbor.clear(); - delete nbor_program; - } - _compiled=false; - } } double PairGPUNbor::host_memory_usage() const { if (_gpu_nbor) { if (_gpu_host) - return host_nbor.row_bytes()*host_nbor.rows(); + return host_nbor.row_bytes()*host_nbor.rows()+host_ilist.row_bytes()+ + host_jlist.row_bytes(); else return 0; } else @@ -186,7 +199,7 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj, UCL_H_Vec ilist_view; ilist_view.view(ilist,inum,*dev); - ucl_copy(dev_nbor,ilist_view,true); + ucl_copy(dev_nbor,ilist_view,false); UCL_D_Vec nbor_offset; UCL_H_Vec host_offset; @@ -238,46 +251,20 @@ void PairGPUNbor::get_host(const int inum, int *ilist, int *numj, if 
(_use_packing==false) { time_kernel.start(); int GX=static_cast(ceil(static_cast(inum)/block_size)); - k_nbor.set_size(GX,block_size); - k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum); + _shared->k_nbor.set_size(GX,block_size); + _shared->k_nbor.run(&dev_nbor.begin(), &dev_packed.begin(), &inum); time_kernel.stop(); } } -void PairGPUNbor::compile_kernels(UCL_Device &dev) { - std::string flags="-cl-fast-relaxed-math -cl-mad-enable"; - - if (_gpu_nbor==false) { - nbor_program=new UCL_Program(dev); - nbor_program->load_string(pair_gpu_nbor_kernel,flags.c_str()); - k_nbor.set_function(*nbor_program,"kernel_unpack"); - } else { - build_program=new UCL_Program(dev); - #ifdef USE_OPENCL - std::cerr << "CANNOT CURRENTLY USE GPU NEIGHBORING WITH OPENCL\n"; - exit(1); - #else - build_program->load_string(pair_gpu_build_kernel,flags.c_str()); - #endif - k_cell_id.set_function(*build_program,"calc_cell_id"); - k_cell_counts.set_function(*build_program,"kernel_calc_cell_counts"); - k_build_nbor.set_function(*build_program,"calc_neigh_list_cell"); - k_transpose.set_function(*build_program,"transpose"); - k_special.set_function(*build_program,"kernel_special"); - neigh_tex.get_texture(*build_program,"neigh_tex"); - } - _compiled=true; -} - template void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, const int nall, PairGPUAtom &atom, - double *boxlo, double *boxhi, int *tag, + double *sublo, double *subhi, int *tag, int **nspecial, int **special, bool &success, int &mn) { const int nt=inum+host_inum; - if (_maxspecial>0) { time_nbor.start(); UCL_H_Vec view_nspecial, view_special, view_tag; @@ -290,25 +277,25 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, time_nbor.stop(); time_nbor.add_to_total(); time_kernel.start(); - const int b2x=8; - const int b2y=8; + const int b2x=_block_cell_2d; + const int b2y=_block_cell_2d; const int g2x=static_cast(ceil(static_cast(_maxspecial)/b2x)); const int 
g2y=static_cast(ceil(static_cast(nt)/b2y)); - k_transpose.set_size(g2x,g2y,b2x,b2y); - k_transpose.run(&dev_special.begin(),&dev_special_t.begin(),&_maxspecial, - &nt); + _shared->k_transpose.set_size(g2x,g2y,b2x,b2y); + _shared->k_transpose.run(&dev_special.begin(),&dev_special_t.begin(), + &_maxspecial,&nt); } else time_kernel.start(); _nbor_pitch=inum; - neigh_tex.bind_float(atom.dev_x,4); + _shared->neigh_tex.bind_float(atom.dev_x,4); int ncellx, ncelly, ncellz, ncell_3d; - ncellx = static_cast(ceil(((boxhi[0] - boxlo[0]) + + ncellx = static_cast(ceil(((subhi[0] - sublo[0]) + 2.0*_cell_size)/_cell_size)); - ncelly = static_cast(ceil(((boxhi[1] - boxlo[1]) + + ncelly = static_cast(ceil(((subhi[1] - sublo[1]) + 2.0*_cell_size)/_cell_size)); - ncellz = static_cast(ceil(((boxhi[2] - boxlo[2]) + + ncellz = static_cast(ceil(((subhi[2] - sublo[2]) + 2.0*_cell_size)/_cell_size)); ncell_3d = ncellx * ncelly * ncellz; UCL_D_Vec cell_counts; @@ -316,35 +303,36 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, _cell_bytes=cell_counts.row_bytes(); /* build cell list on GPU */ - const int neigh_block=128; + const int neigh_block=_block_cell_id; const int GX=(int)ceil((float)nall/neigh_block); - const numtyp boxlo0=static_cast(boxlo[0]); - const numtyp boxlo1=static_cast(boxlo[1]); - const numtyp boxlo2=static_cast(boxlo[2]); - const numtyp boxhi0=static_cast(boxhi[0]); - const numtyp boxhi1=static_cast(boxhi[1]); - const numtyp boxhi2=static_cast(boxhi[2]); + const numtyp sublo0=static_cast(sublo[0]); + const numtyp sublo1=static_cast(sublo[1]); + const numtyp sublo2=static_cast(sublo[2]); + const numtyp subhi0=static_cast(subhi[0]); + const numtyp subhi1=static_cast(subhi[1]); + const numtyp subhi2=static_cast(subhi[2]); const numtyp cell_size_cast=static_cast(_cell_size); - k_cell_id.set_size(GX,neigh_block); - k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), - &atom.dev_particle_id.begin(), - &boxlo0, &boxlo1, &boxlo2, &boxhi0, 
&boxhi1, - &boxhi2, &cell_size_cast, &ncellx, &ncelly, &nall); + _shared->k_cell_id.set_size(GX,neigh_block); + _shared->k_cell_id.run(&atom.dev_x.begin(), &atom.dev_cell_id.begin(), + &atom.dev_particle_id.begin(), + &sublo0, &sublo1, &sublo2, &subhi0, &subhi1, + &subhi2, &cell_size_cast, &ncellx, &ncelly, &nall); atom.sort_neighbor(nall); /* calculate cell count */ - k_cell_counts.set_size(GX,neigh_block); - k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), &nall, - &ncell_3d); + _shared->k_cell_counts.set_size(GX,neigh_block); + _shared->k_cell_counts.run(&atom.dev_cell_id.begin(), &cell_counts.begin(), + &nall, &ncell_3d); /* build the neighbor list */ - const int cell_block=64; - k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); - k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), - &cell_counts.begin(), &dev_nbor.begin(), - &dev_host_nbor.begin(), &_max_nbors, &cell_size_cast, - &ncellx, &ncelly, &ncellz, &inum, &nt, &nall); + const int cell_block=_block_nbor_build; + _shared->k_build_nbor.set_size(ncellx, ncelly*ncellz, cell_block, 1); + _shared->k_build_nbor.run(&atom.dev_x.begin(), &atom.dev_particle_id.begin(), + &cell_counts.begin(), &dev_nbor.begin(), + &dev_host_nbor.begin(), &dev_host_numj.begin(), + &_max_nbors,&cell_size_cast, + &ncellx, &ncelly, &ncellz, &inum, &nt, &nall); /* Get the maximum number of nbors and realloc if necessary */ UCL_D_Vec numj; @@ -353,7 +341,7 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, if (nt>inum) { UCL_H_Vec host_offset; host_offset.view_offset(inum,host_acc,nt-inum); - ucl_copy(host_offset,dev_host_nbor,nt-inum,false); + ucl_copy(host_offset,dev_host_numj,nt-inum,false); } mn=host_acc[0]; for (int i=1; i0) { host_nbor.clear(); dev_host_nbor.clear(); - success=success && (host_nbor.alloc((mn+1)*_max_host,dev_nbor, + success=success && (host_nbor.alloc(mn*_max_host,dev_nbor, UCL_RW_OPTIMIZED)==UCL_SUCCESS); - success=success && 
(dev_host_nbor.alloc((mn+1)*_max_host, + success=success && (dev_host_nbor.alloc(mn*_max_host, dev_nbor,UCL_WRITE_ONLY)==UCL_SUCCESS); + int *ptr=host_nbor.begin(); + for (int i=0; i<_max_host; i++) { + host_jlist[i]=ptr; + ptr+=mn; + } _gpu_bytes+=dev_host_nbor.row_bytes(); } if (_alloc_packed) { @@ -385,28 +378,29 @@ void PairGPUNbor::build_nbor_list(const int inum, const int host_inum, _max_nbors=mn; time_kernel.stop(); time_kernel.add_to_total(); - build_nbor_list(inum, host_inum, nall, atom, boxlo, boxhi, tag, nspecial, + build_nbor_list(inum, host_inum, nall, atom, sublo, subhi, tag, nspecial, special, success, mn); return; } if (_maxspecial>0) { const int GX2=static_cast(ceil(static_cast(nt)/cell_block)); - k_special.set_size(GX2,cell_block); - k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), - &atom.dev_tag.begin(), &dev_nspecial.begin(), - &dev_special.begin(), &inum, &nt, &nall); + _shared->k_special.set_size(GX2,cell_block); + _shared->k_special.run(&dev_nbor.begin(), &dev_host_nbor.begin(), + &dev_host_numj.begin(), &atom.dev_tag.begin(), + &dev_nspecial.begin(), &dev_special.begin(), + &inum, &nt, &nall, &_max_nbors); } time_kernel.stop(); time_nbor.start(); if (_gpu_host) - ucl_copy(host_nbor,dev_host_nbor,host_inum*(mn+1),false); + ucl_copy(host_nbor,dev_host_nbor,false); time_nbor.stop(); } template void PairGPUNbor::build_nbor_list - (const int inum, const int host_inum, const int nall, - PairGPUAtom &atom, double *boxlo, double *boxhi, + (const int inum, const int host_inum, const int nall, + PairGPUAtom &atom, double *sublo, double *subhi, int *, int **, int **, bool &success, int &mn); diff --git a/lib/gpu/pair_gpu_nbor.h b/lib/gpu/pair_gpu_nbor.h index 403bd7aed4..02ad4b201b 100644 --- a/lib/gpu/pair_gpu_nbor.h +++ b/lib/gpu/pair_gpu_nbor.h @@ -19,32 +19,27 @@ #define PAIR_GPU_NBOR_H #include "pair_gpu_atom.h" +#include "pair_gpu_nbor_shared.h" #define IJ_SIZE 131072 #ifdef USE_OPENCL -#include "geryon/ocl_device.h" #include 
"geryon/ocl_timer.h" #include "geryon/ocl_mat.h" -#include "geryon/ocl_kernel.h" -#include "geryon/ocl_texture.h" using namespace ucl_opencl; #else -#include "geryon/nvd_device.h" #include "geryon/nvd_timer.h" #include "geryon/nvd_mat.h" -#include "geryon/nvd_kernel.h" -#include "geryon/nvd_texture.h" using namespace ucl_cudadr; #endif class PairGPUNbor { public: - PairGPUNbor() : _allocated(false), _use_packing(false), _compiled(false) {} + PairGPUNbor() : _allocated(false), _use_packing(false) {} ~PairGPUNbor() { clear(); } /// Determine whether neighbor unpacking should be used @@ -62,9 +57,11 @@ class PairGPUNbor { * 2 if gpu_nbor is true, and host needs a full nbor list * \param pre_cut True if cutoff test will be performed in separate kernel * than the force kernel **/ - bool init(const int inum, const int host_inum, const int max_nbors, - const int maxspecial, UCL_Device &dev, const bool gpu_nbor, - const int gpu_host, const bool pre_cut); + bool init(PairGPUNborShared *shared, const int inum, const int host_inum, + const int max_nbors, const int maxspecial, UCL_Device &dev, + const bool gpu_nbor, const int gpu_host, const bool pre_cut, + const int block_cell_2d, const int block_cell_id, + const int block_nbor_build); /// Set the size of the cutoff+skin inline void cell_size(const double size) { _cell_size=size; } @@ -131,18 +128,18 @@ class PairGPUNbor { inline int max_nbors() const { return _max_nbors; } /// Loop through neighbor count array and return maximum nbors for a particle - inline int max_nbor_loop(const int inum, int *numj) const { + inline int max_nbor_loop(const int inum, int *numj, int *ilist) const { int mn=0; for (int i=0; i void build_nbor_list(const int inum, const int host_inum, const int nall, - PairGPUAtom &atom, double *boxlo, - double *boxhi, int *tag, int **nspecial, int **special, + PairGPUAtom &atom, double *sublo, + double *subhi, int *tag, int **nspecial, int **special, bool &success, int &max_nbors); /// Return the number of 
bytes used on device @@ -176,31 +173,31 @@ class PairGPUNbor { UCL_H_Vec host_nbor; /// Device storage for neighbor list matrix that will be copied to host /** - 1st row is numj - * - Remaining rows are nbors **/ + * - Remaining rows are by atom, columns are nbors **/ UCL_D_Vec dev_host_nbor; + UCL_D_Vec dev_host_numj; + UCL_H_Vec host_ilist; + UCL_H_Vec host_jlist; /// Device storage for special neighbor counts UCL_D_Vec dev_nspecial; /// Device storage for special neighbors UCL_D_Vec dev_special, dev_special_t; - /// Texture for cached position/type access with CUDA - UCL_Texture neigh_tex; /// Device timers UCL_Timer time_nbor, time_kernel; private: + PairGPUNborShared *_shared; UCL_Device *dev; - UCL_Program *nbor_program, *build_program; - UCL_Kernel k_nbor, k_cell_id, k_cell_counts, k_build_nbor; - UCL_Kernel k_transpose, k_special; - bool _allocated, _use_packing, _compiled; - void compile_kernels(UCL_Device &dev); + bool _allocated, _use_packing; int _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial; bool _gpu_nbor, _gpu_host, _alloc_packed; double _cell_size; double _gpu_bytes, _c_bytes, _cell_bytes; void alloc(bool &success); + + int _block_cell_2d, _block_cell_id, _block_nbor_build; }; #endif diff --git a/lib/gpu/pair_gpu_precision.h b/lib/gpu/pair_gpu_precision.h index a5f57c1f95..902975be0b 100644 --- a/lib/gpu/pair_gpu_precision.h +++ b/lib/gpu/pair_gpu_precision.h @@ -84,8 +84,6 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #define acctyp4 _lgpu_float4 #endif -#define MAX_SHARED_TYPES 8 -#define MAX_BIO_SHARED_TYPES 128 enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; #endif From 6321bca76958bf1ef635f60f40763942b15aaf6f Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:02:58 +0000 Subject: [PATCH 18/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6054 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/MAKE/Makefile.altix | 2 +- src/MAKE/Makefile.bgl | 2 +- 
src/MAKE/Makefile.cygwin | 2 +- src/MAKE/Makefile.encanto | 2 +- src/MAKE/Makefile.fink | 2 +- src/MAKE/Makefile.g++ | 2 +- src/MAKE/Makefile.g++3 | 2 +- src/MAKE/Makefile.glory | 2 +- src/MAKE/Makefile.jaguar | 2 +- src/MAKE/Makefile.lam | 2 +- src/MAKE/Makefile.linux | 2 +- src/MAKE/Makefile.mac | 2 +- src/MAKE/Makefile.mac_mpi | 2 +- src/MAKE/Makefile.mingw | 2 +- src/MAKE/Makefile.mkl | 2 +- src/MAKE/Makefile.odin | 2 +- src/MAKE/Makefile.openmpi | 2 +- src/MAKE/Makefile.pgi | 2 +- src/MAKE/Makefile.power5 | 2 +- src/MAKE/Makefile.qed | 2 +- src/MAKE/Makefile.redsky | 2 +- src/MAKE/Makefile.sdsc | 2 +- src/MAKE/Makefile.seaborg | 2 +- src/MAKE/Makefile.serial | 2 +- src/MAKE/Makefile.serial_debug | 2 +- src/MAKE/Makefile.sgi | 2 +- src/MAKE/Makefile.solaris | 2 +- src/MAKE/Makefile.spirit | 2 +- src/MAKE/Makefile.storm | 2 +- src/MAKE/Makefile.tacc | 2 +- src/MAKE/Makefile.tbird | 2 +- src/MAKE/Makefile.tesla | 2 +- src/MAKE/Makefile.tunnison | 2 +- src/MAKE/Makefile.xt3 | 2 +- 34 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/MAKE/Makefile.altix b/src/MAKE/Makefile.altix index 26c07247cc..982a822aa3 100644 --- a/src/MAKE/Makefile.altix +++ b/src/MAKE/Makefile.altix @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.bgl b/src/MAKE/Makefile.bgl index e8d25d745d..0568d33d53 100644 --- a/src/MAKE/Makefile.bgl +++ b/src/MAKE/Makefile.bgl @@ -63,7 +63,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.cygwin b/src/MAKE/Makefile.cygwin index cb35ccbf34..0d6264848f 100644 --- 
a/src/MAKE/Makefile.cygwin +++ b/src/MAKE/Makefile.cygwin @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.encanto b/src/MAKE/Makefile.encanto index 14a5b7a359..69f8d434ad 100644 --- a/src/MAKE/Makefile.encanto +++ b/src/MAKE/Makefile.encanto @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.fink b/src/MAKE/Makefile.fink index f6a1afa400..0474e0246a 100644 --- a/src/MAKE/Makefile.fink +++ b/src/MAKE/Makefile.fink @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.g++ b/src/MAKE/Makefile.g++ index 2381f7235c..147ac4f388 100755 --- a/src/MAKE/Makefile.g++ +++ b/src/MAKE/Makefile.g++ @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lgfortran reax_SYSLIB = -lgfortran user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.g++3 b/src/MAKE/Makefile.g++3 index 1b35f3c2c0..ddafa913d9 100755 --- a/src/MAKE/Makefile.g++3 +++ b/src/MAKE/Makefile.g++3 @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git 
a/src/MAKE/Makefile.glory b/src/MAKE/Makefile.glory index 9e9f9345bd..00312daf4b 100644 --- a/src/MAKE/Makefile.glory +++ b/src/MAKE/Makefile.glory @@ -75,7 +75,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.jaguar b/src/MAKE/Makefile.jaguar index 50b9934486..07ee4436e4 100644 --- a/src/MAKE/Makefile.jaguar +++ b/src/MAKE/Makefile.jaguar @@ -63,7 +63,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.lam b/src/MAKE/Makefile.lam index f7ad21bc5e..0082b29699 100644 --- a/src/MAKE/Makefile.lam +++ b/src/MAKE/Makefile.lam @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.linux b/src/MAKE/Makefile.linux index 15df43de64..a82f1347bb 100755 --- a/src/MAKE/Makefile.linux +++ b/src/MAKE/Makefile.linux @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.mac b/src/MAKE/Makefile.mac index 0a6dfd9bc7..4ac8beaacd 100755 --- a/src/MAKE/Makefile.mac +++ b/src/MAKE/Makefile.mac @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = 
-lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.mac_mpi b/src/MAKE/Makefile.mac_mpi index e28fa06033..c0e2aa4170 100755 --- a/src/MAKE/Makefile.mac_mpi +++ b/src/MAKE/Makefile.mac_mpi @@ -60,7 +60,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lgfortran reax_SYSLIB = -lgfortran user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.mingw b/src/MAKE/Makefile.mingw index d727478859..81132902fb 100644 --- a/src/MAKE/Makefile.mingw +++ b/src/MAKE/Makefile.mingw @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.mkl b/src/MAKE/Makefile.mkl index 40abca8301..4d6cd22545 100644 --- a/src/MAKE/Makefile.mkl +++ b/src/MAKE/Makefile.mkl @@ -63,7 +63,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.odin b/src/MAKE/Makefile.odin index b370ed16c8..cabb2dc4ea 100755 --- a/src/MAKE/Makefile.odin +++ b/src/MAKE/Makefile.odin @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.openmpi b/src/MAKE/Makefile.openmpi index fb86bb4c57..0fcf6fb650 100644 --- a/src/MAKE/Makefile.openmpi +++ b/src/MAKE/Makefile.openmpi @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries 
# SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.pgi b/src/MAKE/Makefile.pgi index 20dcd71ea8..c945e91124 100644 --- a/src/MAKE/Makefile.pgi +++ b/src/MAKE/Makefile.pgi @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.power5 b/src/MAKE/Makefile.power5 index 7ea1b76e2b..616e8f31dc 100644 --- a/src/MAKE/Makefile.power5 +++ b/src/MAKE/Makefile.power5 @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.qed b/src/MAKE/Makefile.qed index 35961016b1..bf008ef744 100644 --- a/src/MAKE/Makefile.qed +++ b/src/MAKE/Makefile.qed @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.redsky b/src/MAKE/Makefile.redsky index 2f9b7a880e..579e527ff7 100644 --- a/src/MAKE/Makefile.redsky +++ b/src/MAKE/Makefile.redsky @@ -86,7 +86,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = ${BLASLIB} diff --git a/src/MAKE/Makefile.sdsc b/src/MAKE/Makefile.sdsc index c7a438d630..21acdaa375 
100644 --- a/src/MAKE/Makefile.sdsc +++ b/src/MAKE/Makefile.sdsc @@ -62,7 +62,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.seaborg b/src/MAKE/Makefile.seaborg index 5134fa34ec..d8398506cb 100644 --- a/src/MAKE/Makefile.seaborg +++ b/src/MAKE/Makefile.seaborg @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.serial b/src/MAKE/Makefile.serial index 7f431b3c7e..64cf13db29 100755 --- a/src/MAKE/Makefile.serial +++ b/src/MAKE/Makefile.serial @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lgfortran reax_SYSLIB = -lgfortran user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.serial_debug b/src/MAKE/Makefile.serial_debug index 771d60b260..9dad10864f 100644 --- a/src/MAKE/Makefile.serial_debug +++ b/src/MAKE/Makefile.serial_debug @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lgfortran reax_SYSLIB = -lgfortran user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.sgi b/src/MAKE/Makefile.sgi index 0b3e03edd1..bb1c8fc254 100644 --- a/src/MAKE/Makefile.sgi +++ b/src/MAKE/Makefile.sgi @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git 
a/src/MAKE/Makefile.solaris b/src/MAKE/Makefile.solaris index 27da3064c6..44ca0148ea 100644 --- a/src/MAKE/Makefile.solaris +++ b/src/MAKE/Makefile.solaris @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.spirit b/src/MAKE/Makefile.spirit index e998af1a72..db5b557fa4 100644 --- a/src/MAKE/Makefile.spirit +++ b/src/MAKE/Makefile.spirit @@ -65,7 +65,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.storm b/src/MAKE/Makefile.storm index 65c838e9ba..8519c35291 100644 --- a/src/MAKE/Makefile.storm +++ b/src/MAKE/Makefile.storm @@ -58,7 +58,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.tacc b/src/MAKE/Makefile.tacc index 0ed8f6f34c..045695597b 100644 --- a/src/MAKE/Makefile.tacc +++ b/src/MAKE/Makefile.tacc @@ -60,7 +60,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore reax_SYSLIB = -lifcore user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.tbird b/src/MAKE/Makefile.tbird index 32783ea237..de21c8fdfe 100644 --- a/src/MAKE/Makefile.tbird +++ b/src/MAKE/Makefile.tbird @@ -76,7 +76,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub 
-limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.tesla b/src/MAKE/Makefile.tesla index 9b35a739c8..337fa9e0f5 100755 --- a/src/MAKE/Makefile.tesla +++ b/src/MAKE/Makefile.tesla @@ -57,7 +57,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack diff --git a/src/MAKE/Makefile.tunnison b/src/MAKE/Makefile.tunnison index d0b2a2efdf..2afd374aa9 100644 --- a/src/MAKE/Makefile.tunnison +++ b/src/MAKE/Makefile.tunnison @@ -68,7 +68,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -llapack diff --git a/src/MAKE/Makefile.xt3 b/src/MAKE/Makefile.xt3 index a3089716dc..2c610c7bc7 100644 --- a/src/MAKE/Makefile.xt3 +++ b/src/MAKE/Makefile.xt3 @@ -59,7 +59,7 @@ JPG_LIB = # SYSLIB = names of libraries # SYSPATH = paths of libraries -gpu_SYSLIB = -lcudart +gpu_SYSLIB = -lcudart -lcuda meam_SYSLIB = -lifcore -lsvml -lompstub -limf reax_SYSLIB = -lifcore -lsvml -lompstub -limf user-atc_SYSLIB = -lblas -llapack From 96fb599b2d3d5814e74b19f49475d22e5e686f5c Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:03:20 +0000 Subject: [PATCH 19/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6055 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/GPU/Install.sh | 25 +++ src/GPU/fix_gpu.cpp | 48 ++-- src/GPU/fix_gpu.h | 2 + src/GPU/pair_cg_cmm_coul_long_gpu.cpp | 274 +++++------------------ src/GPU/pair_cg_cmm_coul_long_gpu.h | 3 +- src/GPU/pair_cg_cmm_gpu.cpp | 203 ++++------------- src/GPU/pair_cg_cmm_gpu.h | 3 +- src/GPU/pair_gayberne_gpu.cpp | 268 ++++++---------------- src/GPU/pair_gayberne_gpu.h | 5 +- 
src/GPU/pair_lj96_cut_gpu.cpp | 166 ++++---------- src/GPU/pair_lj96_cut_gpu.h | 3 +- src/GPU/pair_lj_charmm_coul_long_gpu.cpp | 264 ++++++---------------- src/GPU/pair_lj_charmm_coul_long_gpu.h | 3 +- src/GPU/pair_lj_cut_coul_cut_gpu.cpp | 192 ++++------------ src/GPU/pair_lj_cut_coul_cut_gpu.h | 3 +- src/GPU/pair_lj_cut_coul_long_gpu.cpp | 225 ++++--------------- src/GPU/pair_lj_cut_coul_long_gpu.h | 3 +- src/GPU/pair_lj_cut_gpu.cpp | 165 ++++---------- src/GPU/pair_lj_cut_gpu.h | 3 +- 19 files changed, 491 insertions(+), 1367 deletions(-) diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 29504865b4..a17dc9ffd5 100644 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -14,6 +14,15 @@ if (test $1 = 1) then sed -i -e 's|^PKG_SYSLIB =[ \t]*|&$(gpu_SYSLIB) |' ../Makefile.package fi + if (test -e ../pppm.cpp) then + cp pppm_gpu.cpp .. + cp pppm_gpu_single.cpp .. + cp pppm_gpu_double.cpp .. + cp pppm_gpu.h .. + cp pppm_gpu_single.h .. + cp pppm_gpu_double.h .. + fi + if (test -e ../pair_gayberne.cpp) then cp pair_gayberne_gpu.cpp .. cp pair_gayberne_gpu.h .. @@ -40,14 +49,19 @@ if (test $1 = 1) then fi cp pair_lj_cut_gpu.cpp .. + cp pair_morse_gpu.cpp .. cp pair_lj96_cut_gpu.cpp .. + cp pair_lj_expand_gpu.cpp .. cp pair_lj_cut_coul_cut_gpu.cpp .. cp pair_lj_cut_gpu.h .. + cp pair_morse_gpu.h .. cp pair_lj96_cut_gpu.h .. + cp pair_lj_expand_gpu.h .. cp pair_lj_cut_coul_cut_gpu.h .. cp fix_gpu.cpp .. cp fix_gpu.h .. + cp gpu_extra.h .. 
elif (test $1 = 0) then @@ -56,9 +70,14 @@ elif (test $1 = 0) then sed -i -e 's/[^ \t]*gpu_[^ \t]*) //' ../Makefile.package fi + rm ../pppm_gpu.cpp + rm ../pppm_gpu_single.cpp + rm ../pppm_gpu_double.cpp rm ../pair_gayberne_gpu.cpp rm ../pair_lj_cut_gpu.cpp + rm ../pair_morse_gpu.cpp rm ../pair_lj96_cut_gpu.cpp + rm ../pair_lj_expand_gpu.cpp rm ../pair_lj_cut_coul_cut_gpu.cpp rm ../pair_lj_cut_coul_long_gpu.cpp rm ../pair_lj_charmm_coul_long_gpu.cpp @@ -66,15 +85,21 @@ elif (test $1 = 0) then rm ../pair_cg_cmm_coul_long_gpu.cpp rm ../fix_gpu.cpp + rm ../pppm_gpu.h + rm ../pppm_gpu_single.cpp + rm ../pppm_gpu_double.h rm ../pair_gayberne_gpu.h rm ../pair_lj_cut_gpu.h + rm ../pair_morse_gpu.h rm ../pair_lj96_cut_gpu.h + rm ../pair_lj_expand_gpu.h rm ../pair_lj_cut_coul_cut_gpu.h rm ../pair_lj_cut_coul_long_gpu.h rm ../pair_lj_charmm_coul_long_gpu.h rm ../pair_cg_cmm_gpu.h rm ../pair_cg_cmm_coul_long_gpu.h rm ../fix_gpu.h + rm ../gpu_extra.h fi diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 75ce1e83f3..54721900e6 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -24,15 +24,16 @@ #include "modify.h" #include "domain.h" #include "universe.h" +#include "gpu_extra.h" using namespace LAMMPS_NS; enum{GPU_FORCE, GPU_NEIGH}; -extern bool lmp_init_device(MPI_Comm world, MPI_Comm replica, - const int first_gpu, const int last_gpu, - const int gpu_mode, const double particle_split, - const int nthreads); +extern int lmp_init_device(MPI_Comm world, MPI_Comm replica, + const int first_gpu, const int last_gpu, + const int gpu_mode, const double particle_split, + const int nthreads, const int t_per_atom); extern void lmp_clear_device(); extern double lmp_gpu_forces(double **f, double **tor, double *eatom, double **vatom, double *virial, double &ecoul); @@ -42,18 +43,17 @@ extern double lmp_gpu_forces(double **f, double **tor, double *eatom, FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg) { - if (narg != 7) error->all("Illegal fix gpu 
command"); + if (narg < 7) error->all("Illegal fix gpu command"); if (strcmp(arg[1],"all") != 0) error->all("Illegal fix gpu command"); - int gpu_mode, first_gpu, last_gpu; - double particle_split; + int first_gpu, last_gpu; if (strcmp(arg[3],"force") == 0) - gpu_mode = GPU_FORCE; + _gpu_mode = GPU_FORCE; else if (strcmp(arg[3],"force/neigh") == 0) { - gpu_mode = GPU_NEIGH; + _gpu_mode = GPU_NEIGH; if (domain->triclinic) error->all("Cannot use force/neigh with triclinic box."); } else @@ -62,13 +62,24 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : first_gpu = atoi(arg[4]); last_gpu = atoi(arg[5]); - particle_split = force->numeric(arg[6]); - if (particle_split==0 || particle_split>1) + _particle_split = force->numeric(arg[6]); + if (_particle_split==0 || _particle_split>1) error->all("Illegal fix gpu command."); - if (!lmp_init_device(universe->uworld,world,first_gpu,last_gpu,gpu_mode, - particle_split,1)) - error->one("Could not find or initialize a specified accelerator device."); + int nthreads = 1; + int threads_per_atom = -1; + if (narg == 9) { + if (strcmp(arg[7],"threads_per_atom") == 0) + threads_per_atom = atoi(arg[8]); + else + error->all("Illegal fix gpu command."); + } else if (narg != 7) + error->all("Illegal fix gpu command."); + + int gpu_flag = lmp_init_device(universe->uworld, world, first_gpu, last_gpu, + _gpu_mode, _particle_split, nthreads, + threads_per_atom); + GPU_EXTRA::check_flag(gpu_flag,error,world); } /* ---------------------------------------------------------------------- */ @@ -95,6 +106,15 @@ void FixGPU::init() // Can only have 1 gpu fix that must be the first fix for a run if ((void*)modify->fix[0] != (void*)this) error->all("GPU is not the first fix for this run."); + // Hybrid cannot be used with force/neigh option + if (_gpu_mode == GPU_NEIGH) + if (force->pair_match("hybrid",1) != NULL || + force->pair_match("hybrid/overlay",1) != NULL) + error->all("Cannot use pair hybrid with GPU neighbor builds."); + if 
(_particle_split < 0) + if (force->pair_match("hybrid",1) != NULL || + force->pair_match("hybrid/overlay",1) != NULL) + error->all("Fix gpu split must be positive for hybrid pair styles."); } /* ---------------------------------------------------------------------- */ diff --git a/src/GPU/fix_gpu.h b/src/GPU/fix_gpu.h index 35c8dea324..30b53ac879 100644 --- a/src/GPU/fix_gpu.h +++ b/src/GPU/fix_gpu.h @@ -37,6 +37,8 @@ class FixGPU : public Fix { double memory_usage(); private: + int _gpu_mode; + double _particle_split; }; } diff --git a/src/GPU/pair_cg_cmm_coul_long_gpu.cpp b/src/GPU/pair_cg_cmm_coul_long_gpu.cpp index 6d11692d5c..153cb98a9e 100644 --- a/src/GPU/pair_cg_cmm_coul_long_gpu.cpp +++ b/src/GPU/pair_cg_cmm_coul_long_gpu.cpp @@ -35,6 +35,7 @@ #include "domain.h" #include "string.h" #include "kspace.h" +#include "gpu_extra.h" #define MIN(a,b) ((a) < (b) ? (a) : (b)) #define MAX(a,b) ((a) > (b) ? (a) : (b)) @@ -49,27 +50,29 @@ // External functions from cuda library for atom decomposition -bool cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald); +int cmml_gpu_init(const int ntypes, double **cutsq, int **cg_type, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald); void cmml_gpu_clear(); -int * cmml_gpu_compute_n(const int timestep, const int ago, const int 
inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void cmml_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); +int ** cmml_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd); +void cmml_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double cmml_gpu_bytes(); using namespace LAMMPS_NS; @@ -95,8 +98,6 @@ PairCGCMMCoulLongGPU::~PairCGCMMCoulLongGPU() void PairCGCMMCoulLongGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -104,31 +105,32 @@ void PairCGCMMCoulLongGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = cmml_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, 
atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = cmml_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - cmml_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + cmml_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style cg/cmm/coul/long requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU cg/cmm pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -176,17 +178,13 @@ void PairCGCMMCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3, - lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, - cut_coulsq_global, force->special_coul, - force->qqrd2e, g_ewald); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU cg/cmm pair style"); + int success 
= cmml_gpu_init(atom->ntypes+1, cutsq, cg_type, lj1, lj2, lj3, + lj4, offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq_global, force->special_coul, + force->qqrd2e, g_ewald); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -205,14 +203,16 @@ double PairCGCMMCoulLongGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) +void PairCGCMMCoulLongGPU::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, int *numneigh, + int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype,itable; + int i,j,ii,jj,jnum,itype,jtype,itable; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz; double fraction,table; double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; double grij,expm2,prefactor,t,erfc; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double rsq; double **x = atom->x; @@ -225,11 +225,6 @@ void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -244,13 +239,9 @@ void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; - - if (j < nall) factor_coul = factor_lj = 1.0; - else { - factor_coul = special_coul[j/nall]; - factor_lj = special_lj[j/nall]; - j %= nall; - } + factor_lj = special_lj[sbmask(j)]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; const double delx = xtmp - x[j][0]; const double dely = ytmp - x[j][1]; @@ -347,156 +338,3 @@ void PairCGCMMCoulLongGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* 
---------------------------------------------------------------------- */ - -void PairCGCMMCoulLongGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) -{ - int i,j,jnum,itype,jtype,itable; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz; - double fraction,table; - double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - double grij,expm2,prefactor,t,erfc; - double rsq; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - int stride = nlocal-start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor>= ncoulshiftbits; - const double fraction = (rsq_lookup.f - rtable[itable]) * - drtable[itable]; - const double table = ftable[itable] + fraction*dftable[itable]; - forcecoul = qtmp*q[j] * table; - if (eflag) { - const double table2 = etable[itable] + fraction*detable[itable]; - ecoul = qtmp*q[j] * table2; - } - if (factor_coul < 1.0) { - const double table2 = ctable[itable] + fraction*dctable[itable]; - const double prefactor = qtmp*q[j] * table2; - forcecoul -= (1.0-factor_coul)*prefactor; - if (eflag) ecoul -= (1.0-factor_coul)*prefactor; - } - } - } - fpair = (forcecoul + forcelj) * r2inv; - - f[i][0] += delx*fpair; - f[i][1] += dely*fpair; - f[i][2] += delz*fpair; - - if (j (b) ? 
(a) : (b)) // External functions from cuda library for atom decomposition -bool cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, - double **host_lj1, double **host_lj2, double **host_lj3, - double **host_lj4, double **offset, double *special_lj, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen); +int cmm_gpu_init(const int ntypes, double **cutsq, int **cg_types, + double **host_lj1, double **host_lj2, double **host_lj3, + double **host_lj4, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen); void cmm_gpu_clear(); -int * cmm_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); -void cmm_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success); +int ** cmm_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void cmm_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); 
double cmm_gpu_bytes(); using namespace LAMMPS_NS; @@ -84,8 +85,6 @@ PairCGCMMGPU::~PairCGCMMGPU() void PairCGCMMGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -93,30 +92,30 @@ void PairCGCMMGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = cmm_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success); + firstneigh = cmm_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success); } else { inum = list->inum; - cmm_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + cmm_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); } if (!success) error->one("Out of memory on GPGPU"); if (host_startpair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU CGCMM pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -152,15 +151,11 @@ void PairCGCMMGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = 
cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU CGCMM pair style"); + int success = cmm_gpu_init(atom->ntypes+1,cutsq,cg_type,lj1,lj2,lj3,lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -179,11 +174,13 @@ double PairCGCMMGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) { - int i,j,ii,jj,inum,jnum,itype,jtype; +void PairCGCMMGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r6inv,forcelj,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double **x = atom->x; double **f = atom->f; @@ -192,11 +189,6 @@ void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) { int nall = nlocal + atom->nghost; double *special_lj = force->special_lj; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -210,12 +202,8 @@ void PairCGCMMGPU::cpu_compute(int start, int eflag, int vflag) { for (jj = 0; jj < jnum; jj++) { j = jlist[jj]; - - if (j < nall) factor_lj = 1.0; - else { - factor_lj = special_lj[j/nall]; - j %= nall; - } + factor_lj = special_lj[sbmask(j)]; + j &= NEIGHMASK; delx = xtmp - x[j][0]; dely = ytmp - x[j][1]; @@ -266,100 +254,3 @@ void PairCGCMMGPU::cpu_compute(int start, 
int eflag, int vflag) { } } } - -/* ---------------------------------------------------------------------- */ - -void PairCGCMMGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { - int i,j,itype,jtype; - int nlocal = atom->nlocal; - int nall = nlocal + atom->nghost; - int stride = nlocal-start; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r2inv,r6inv,forcelj,factor_lj; - double *special_lj = force->special_lj; - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - int jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor (b) ? (a) : (b)) // External functions from cuda library for atom decomposition -bool gb_gpu_init(const int ntypes, const double gamma, const double upsilon, - const double mu, double **shape, double **well, double **cutsq, - double **sigma, double **epsilon, double *host_lshape, - int **form, double **host_lj1, double **host_lj2, - double **host_lj3, double **host_lj4, double **offset, - double *special_lj, const int nlocal, const int nall, - const int max_nbors, const double cell_size, - int &gpu_mode, FILE *screen); +int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, + const double mu, double **shape, double **well, double **cutsq, + double **sigma, double **epsilon, double *host_lshape, + int **form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + int &gpu_mode, FILE *screen); void gb_gpu_clear(); -int * gb_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, const bool eflag, - const bool vflag, const bool 
eatom, const bool vatom, - int &host_start, const double cpu_time, bool &success, - double **host_quat); -int * gb_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double **host_quat); +int ** gb_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double **host_quat); +int * gb_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double **host_quat); double gb_gpu_bytes(); using namespace LAMMPS_NS; @@ -77,6 +77,8 @@ PairGayBerneGPU::PairGayBerneGPU(LAMMPS *lmp) : PairGayBerne(lmp), avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid"); if (!avec) error->all("Pair gayberne requires atom style ellipsoid"); + quat_nmax = 0; + quat = NULL; } /* ---------------------------------------------------------------------- @@ -87,14 +89,13 @@ PairGayBerneGPU::~PairGayBerneGPU() { gb_gpu_clear(); cpu_time = 0.0; + memory->destroy(quat); } /* ---------------------------------------------------------------------- */ void PairGayBerneGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -102,34 +103,47 @@ void PairGayBerneGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; + int *ilist, *numneigh, **firstneigh; + + if (nall > quat_nmax) { + quat_nmax 
= static_cast(1.1 * nall); + memory->grow(quat, quat_nmax, 4, "pair:quat"); + } + AtomVecEllipsoid::Bonus *bonus = avec->bonus; + int *ellipsoid = atom->ellipsoid; + for (int i=0; i -1) { + quat[i][0] = bonus[qi].quat[0]; + quat[i][1] = bonus[qi].quat[1]; + quat[i][2] = bonus[qi].quat[2]; + quat[i][3] = bonus[qi].quat[3]; + } + } if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - /* MIKE: this arg of atom->quat needs to be modified - gpulist = gb_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, domain->subhi, - eflag, vflag, eflag_atom, vflag_atom, host_start, - cpu_time, success, atom->quat); - */ + firstneigh = gb_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, quat); } else { inum = list->inum; - /* MIKE: this arg of atom->quat needs to be modified - olist = gb_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, - list->firstneigh, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->quat); - */ + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + olist = gb_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, + eflag_atom, vflag_atom, host_start, + cpu_time, success, quat); } if (!success) error->one("Out of memory on GPGPU"); if (host_start < inum) { cpu_time = MPI_Wtime(); - if (gpu_mode == GPU_NEIGH) - cpu_compute(gpulist,host_start,eflag,vflag); - else - cpu_compute(host_start,eflag,vflag); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); cpu_time = MPI_Wtime() - cpu_time; } } @@ -140,8 +154,8 @@ void PairGayBerneGPU::compute(int eflag, int vflag) void PairGayBerneGPU::init_style() { - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); 
+ if (force->newton_pair) + error->all("Cannot use newton pair with GPU Gay-Berne pair style"); if (!atom->ellipsoid_flag) error->all("Pair gayberne requires atom style ellipsoid"); @@ -179,22 +193,20 @@ void PairGayBerneGPU::init_style() double cell_size = sqrt(maxcut) + neighbor->skin; - bool init_ok = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, - shape1, well, cutsq, sigma, epsilon, lshape, form, - lj1, lj2, lj3, lj4, offset, force->special_lj, - atom->nlocal, atom->nlocal+atom->nghost, 300, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu)."); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU Gay-Berne pair style"); + int success = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, + shape2, well, cutsq, sigma, epsilon, lshape, form, + lj1, lj2, lj3, lj4, offset, force->special_lj, + atom->nlocal, atom->nlocal+atom->nghost, 300, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } + quat_nmax = static_cast(1.1 * (atom->nlocal + atom->nghost)); + memory->grow(quat, quat_nmax, 4, "pair:quat"); } /* ---------------------------------------------------------------------- */ @@ -202,18 +214,19 @@ void PairGayBerneGPU::init_style() double PairGayBerneGPU::memory_usage() { double bytes = Pair::memory_usage(); - return bytes + gb_gpu_bytes(); + return bytes + memory->usage(quat,quat_nmax)+gb_gpu_bytes(); } /* ---------------------------------------------------------------------- */ -void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag) +void PairGayBerneGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype; + int i,j,ii,jj,jnum,itype,jtype; double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj; double 
fforce[3],ttor[3],rtor[3],r12[3]; double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3]; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double *iquat,*jquat; AtomVecEllipsoid::Bonus *bonus = avec->bonus; @@ -225,11 +238,6 @@ void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag) int nlocal = atom->nlocal; double *special_lj = force->special_lj; - inum = list->inum; - ilist = olist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -331,143 +339,3 @@ void PairGayBerneGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairGayBerneGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) -{ - int i,j,itype,jtype; - double evdwl,one_eng,rsq,r2inv,r6inv,forcelj,factor_lj; - double fforce[3],ttor[3],rtor[3],r12[3]; - double a1[3][3],b1[3][3],g1[3][3],a2[3][3],b2[3][3],g2[3][3],temp[3][3]; - double *iquat,*jquat; - - AtomVecEllipsoid::Bonus *bonus = avec->bonus; - int *ellipsoid = atom->ellipsoid; - double **x = atom->x; - double **f = atom->f; - double **tor = atom->torque; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double *special_lj = force->special_lj; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - itype = type[i]; - - if (form[itype][itype] == ELLIPSE_ELLIPSE) { - iquat = bonus[ellipsoid[j]].quat; - MathExtra::quat_to_mat_trans(iquat,a1); - MathExtra::diag_times3(well[itype],a1,temp); - MathExtra::transpose_times3(a1,temp,b1); - MathExtra::diag_times3(shape2[itype],a1,temp); - MathExtra::transpose_times3(a1,temp,g1); - } - - int *nbor = nbors+i-start; - int jnum =* nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for ( ; nbor < nbor_end; nbor += stride) { - j = *nbor; - factor_lj = special_lj[sbmask(j)]; - j &= NEIGHMASK; - - // r12 = center to center 
vector - - r12[0] = x[j][0]-x[i][0]; - r12[1] = x[j][1]-x[i][1]; - r12[2] = x[j][2]-x[i][2]; - rsq = MathExtra::dot3(r12,r12); - jtype = type[j]; - - // compute if less than cutoff - - if (rsq < cutsq[itype][jtype]) { - - switch (form[itype][jtype]) { - case SPHERE_SPHERE: - r2inv = 1.0/rsq; - r6inv = r2inv*r2inv*r2inv; - forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); - forcelj *= -r2inv; - if (eflag) one_eng = - r6inv*(r6inv*lj3[itype][jtype]-lj4[itype][jtype]) - - offset[itype][jtype]; - fforce[0] = r12[0]*forcelj; - fforce[1] = r12[1]*forcelj; - fforce[2] = r12[2]*forcelj; - ttor[0] = ttor[1] = ttor[2] = 0.0; - rtor[0] = rtor[1] = rtor[2] = 0.0; - break; - - case SPHERE_ELLIPSE: - jquat = bonus[ellipsoid[j]].quat; - MathExtra::quat_to_mat_trans(jquat,a2); - MathExtra::diag_times3(well[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,b2); - MathExtra::diag_times3(shape2[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,g2); - one_eng = gayberne_lj(j,i,a2,b2,g2,r12,rsq,fforce,rtor); - ttor[0] = ttor[1] = ttor[2] = 0.0; - break; - - case ELLIPSE_SPHERE: - one_eng = gayberne_lj(i,j,a1,b1,g1,r12,rsq,fforce,ttor); - rtor[0] = rtor[1] = rtor[2] = 0.0; - break; - - default: - jquat = bonus[ellipsoid[j]].quat; - MathExtra::quat_to_mat_trans(jquat,a2); - MathExtra::diag_times3(well[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,b2); - MathExtra::diag_times3(shape2[jtype],a2,temp); - MathExtra::transpose_times3(a2,temp,g2); - one_eng = gayberne_analytic(i,j,a1,a2,b1,b2,g1,g2,r12,rsq, - fforce,ttor,rtor); - break; - } - - fforce[0] *= factor_lj; - fforce[1] *= factor_lj; - fforce[2] *= factor_lj; - ttor[0] *= factor_lj; - ttor[1] *= factor_lj; - ttor[2] *= factor_lj; - - f[i][0] += fforce[0]; - f[i][1] += fforce[1]; - f[i][2] += fforce[2]; - tor[i][0] += ttor[0]; - tor[i][1] += ttor[1]; - tor[i][2] += ttor[2]; - - if (eflag) evdwl = factor_lj*one_eng; - - if (j (b) ? 
(a) : (b)) // External functions from cuda library for atom decomposition -bool lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); +int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); void lj96_gpu_clear(); -int * lj96_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); -void lj96_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success); +int ** lj96_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void lj96_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double lj96_gpu_bytes(); using 
namespace LAMMPS_NS; @@ -83,8 +84,6 @@ PairLJ96CutGPU::~PairLJ96CutGPU() void PairLJ96CutGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -92,30 +91,30 @@ void PairLJ96CutGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = lj96_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success); + firstneigh = lj96_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success); } else { inum = list->inum; - lj96_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + lj96_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); } if (!success) error->one("Out of memory on GPGPU"); if (host_startpair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ96 pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -151,15 +150,11 @@ void PairLJ96CutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, - 
offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ96 pair style"); + int success = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -178,11 +173,13 @@ double PairLJ96CutGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) { - int i,j,ii,jj,inum,jnum,itype,jtype; +void PairLJ96CutGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double **x = atom->x; double **f = atom->f; @@ -190,11 +187,6 @@ void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) { int nlocal = atom->nlocal; double *special_lj = force->special_lj; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -239,73 +231,3 @@ void PairLJ96CutGPU::cpu_compute(int start, int eflag, int vflag) { } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJ96CutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { - int i,j,itype,jtype; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r2inv,r3inv,r6inv,forcelj,factor_lj; - double 
*special_lj = force->special_lj; - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - int jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor (b) ? (a) : (b)) @@ -49,35 +50,35 @@ // External functions from cuda library for atom decomposition -bool crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald, const double cut_lj_innersq, - const double denom_lj, double **epsilon, double **sigma, - const bool mix_arithmetic); +int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald, const double cut_lj_innersq, + const double denom_lj, double **epsilon, double **sigma, + const bool mix_arithmetic); void crml_gpu_clear(); -int * crml_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void crml_gpu_compute(const int timestep, 
const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); +int ** crml_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, + double *boxlo, double *prd); +void crml_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double crml_gpu_bytes(); using namespace LAMMPS_NS; -enum{GEOMETRIC,ARITHMETIC,SIXTHPOWER}; - /* ---------------------------------------------------------------------- */ PairLJCharmmCoulLongGPU::PairLJCharmmCoulLongGPU(LAMMPS *lmp) : @@ -100,8 +101,6 @@ PairLJCharmmCoulLongGPU::~PairLJCharmmCoulLongGPU() void PairLJCharmmCoulLongGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -109,31 +108,32 @@ void PairLJCharmmCoulLongGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = crml_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = 
crml_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - crml_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + crml_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style lj/charmm/coul/long requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU CHARMM pair style"); // Repeat cutsq calculation because done after call to init_style double cut; @@ -183,18 +183,24 @@ void PairLJCharmmCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = crml_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, - cut_coulsq, force->special_coul, force->qqrd2e, - g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma, - mix_flag == ARITHMETIC); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - if (force->newton_pair) - error->all("Cannot use newton pair with GPU CHARMM pair style"); + bool arithmetic = true; + for (int i = 1; i < atom->ntypes + 1; i++) + for (int j = i + 1; j < 
atom->ntypes + 1; j++) { + if (epsilon[i][j] != sqrt(epsilon[i][i] * epsilon[j][j])) + arithmetic = false; + if (sigma[i][j] != 0.5 * (sigma[i][i] + sigma[j][j])) + arithmetic = false; + } + + int success = crml_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq, force->special_coul, force->qqrd2e, + g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma, + arithmetic); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -213,15 +219,17 @@ double PairLJCharmmCoulLongGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCharmmCoulLongGPU::cpu_compute(int start, int eflag, int vflag) +void PairLJCharmmCoulLongGPU::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, + int *numneigh, int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype,itable; + int i,j,ii,jj,jnum,itype,jtype,itable; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; double fraction,table; double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; double grij,expm2,prefactor,t,erfc; double philj,switch1,switch2; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double rsq; evdwl = ecoul = 0.0; @@ -235,11 +243,6 @@ void PairLJCharmmCoulLongGPU::cpu_compute(int start, int eflag, int vflag) double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -339,140 +342,3 @@ void PairLJCharmmCoulLongGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCharmmCoulLongGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) 
-{ - int i,j,jnum,itype,jtype,itable; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; - double fraction,table; - double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - double grij,expm2,prefactor,t,erfc; - double philj,switch1,switch2; - double rsq; - - evdwl = ecoul = 0.0; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal - start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor>= ncoulshiftbits; - fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable]; - table = ftable[itable] + fraction*dftable[itable]; - forcecoul = qtmp*q[j] * table; - if (factor_coul < 1.0) { - table = ctable[itable] + fraction*dctable[itable]; - prefactor = qtmp*q[j] * table; - forcecoul -= (1.0-factor_coul)*prefactor; - } - } - } else forcecoul = 0.0; - - if (rsq < cut_ljsq) { - r6inv = r2inv*r2inv*r2inv; - jtype = type[j]; - forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); - if (rsq > cut_lj_innersq) { - switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * - (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj; - switch2 = 12.0*rsq * (cut_ljsq-rsq) * - (rsq-cut_lj_innersq) / denom_lj; - philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]); - forcelj = forcelj*switch1 + philj*switch2; - } - } else forcelj = 0.0; - - fpair = (forcecoul + factor_lj*forcelj) * r2inv; - - f[i][0] += delx*fpair; - f[i][1] += dely*fpair; - f[i][2] += delz*fpair; - - if (eflag) { - if (rsq < cut_coulsq) { - if (!ncoultablebits || rsq <= tabinnersq) - ecoul = prefactor*erfc; - else { - table = etable[itable] + 
fraction*detable[itable]; - ecoul = qtmp*q[j] * table; - } - if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; - } else ecoul = 0.0; - - if (rsq < cut_ljsq) { - evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]); - if (rsq > cut_lj_innersq) { - switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * - (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) / denom_lj; - evdwl *= switch1; - } - evdwl *= factor_lj; - } else evdwl = 0.0; - } - - if (j (b) ? (a) : (b)) // External functions from cuda library for atom decomposition -bool ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e); +int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e); void ljc_gpu_clear(); -int * ljc_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void ljc_gpu_compute(const int timestep, const int ago, const int inum, +int ** ljc_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool 
vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd); +void ljc_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd); double ljc_gpu_bytes(); using namespace LAMMPS_NS; @@ -85,8 +89,6 @@ PairLJCutCoulCutGPU::~PairLJCutCoulCutGPU() void PairLJCutCoulCutGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -94,31 +96,32 @@ void PairLJCutCoulCutGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = ljc_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = ljc_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - ljc_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljc_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + 
ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style lj/cut/coul/cut requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -154,16 +158,12 @@ void PairLJCutCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, - force->special_coul, force->qqrd2e); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ pair style"); + int success = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, + force->special_coul, force->qqrd2e); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -182,12 +182,14 @@ double PairLJCutCoulCutGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag) +void PairLJCutCoulCutGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, + int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype; + int i,j,ii,jj,jnum,itype,jtype; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; 
double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; evdwl = ecoul = 0.0; @@ -201,11 +203,6 @@ void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag) int newton_pair = force->newton_pair; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -264,94 +261,3 @@ void PairLJCutCoulCutGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCutCoulCutGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) -{ - int i,j,jnum,itype,jtype; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; - double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - - evdwl = ecoul = 0.0; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor (b) ? 
(a) : (b)) @@ -49,27 +50,29 @@ // External functions from cuda library for atom decomposition -bool ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald); +int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald); void ljcl_gpu_clear(); -int * ljcl_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q); -void ljcl_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q); +int ** ljcl_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, + int **nspecial, int **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double 
*host_q, + double *boxlo, double *prd); +void ljcl_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double ljcl_gpu_bytes(); using namespace LAMMPS_NS; @@ -96,8 +99,6 @@ PairLJCutCoulLongGPU::~PairLJCutCoulLongGPU() void PairLJCutCoulLongGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -105,31 +106,32 @@ void PairLJCutCoulLongGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = ljcl_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success, - atom->q); + firstneigh = ljcl_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); } else { inum = list->inum; - ljcl_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success, atom->q); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljcl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, 
domain->prd); } if (!success) error->one("Out of memory on GPGPU"); if (host_startq_flag) error->all("Pair style lj/cut/coul/cut requires atom attribute q"); - if (force->pair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -179,16 +181,12 @@ void PairLJCutCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + int success = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, atom->nlocal+atom->nghost, 300, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ pair style"); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -207,14 +205,16 @@ double PairLJCutCoulLongGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag) +void PairLJCutCoulLongGPU::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, int *numneigh, + int **firstneigh) { - int i,j,ii,jj,inum,jnum,itype,jtype,itable; + int i,j,ii,jj,jnum,itype,jtype,itable; double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; double fraction,table; double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; double grij,expm2,prefactor,t,erfc; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double rsq; evdwl = ecoul = 0.0; @@ -228,11 +228,6 @@ void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int 
vflag) double *special_lj = force->special_lj; double qqrd2e = force->qqrd2e; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii < inum; ii++) { @@ -320,127 +315,3 @@ void PairLJCutCoulLongGPU::cpu_compute(int start, int eflag, int vflag) } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCutCoulLongGPU::cpu_compute(int *nbors, int start, int eflag, - int vflag) -{ - int i,j,jnum,itype,jtype,itable; - double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; - double fraction,table; - double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; - double grij,expm2,prefactor,t,erfc; - double rsq; - - evdwl = ecoul = 0.0; - - double **x = atom->x; - double **f = atom->f; - double *q = atom->q; - int *type = atom->type; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double *special_coul = force->special_coul; - double *special_lj = force->special_lj; - double qqrd2e = force->qqrd2e; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - qtmp = q[i]; - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor>= ncoulshiftbits; - fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable]; - table = ftable[itable] + fraction*dftable[itable]; - forcecoul = qtmp*q[j] * table; - if (factor_coul < 1.0) { - table = ctable[itable] + fraction*dctable[itable]; - prefactor = qtmp*q[j] * table; - forcecoul -= (1.0-factor_coul)*prefactor; - } - } - } else forcecoul = 0.0; - - if (rsq < cut_ljsq[itype][jtype]) { - r6inv = r2inv*r2inv*r2inv; - forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); - } else forcelj = 0.0; - - fpair = (forcecoul + factor_lj*forcelj) * r2inv; - - f[i][0] += delx*fpair; - f[i][1] += dely*fpair; - f[i][2] += 
delz*fpair; - - if (eflag) { - if (rsq < cut_coulsq) { - if (!ncoultablebits || rsq <= tabinnersq) - ecoul = prefactor*erfc; - else { - table = etable[itable] + fraction*detable[itable]; - ecoul = qtmp*q[j] * table; - } - if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor; - } else ecoul = 0.0; - - if (rsq < cut_ljsq[itype][jtype]) { - evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) - - offset[itype][jtype]; - evdwl *= factor_lj; - } else evdwl = 0.0; - } - - if (j (b) ? (a) : (b)) // External functions from cuda library for atom decomposition -bool ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); +int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); void ljl_gpu_clear(); -int * ljl_gpu_compute_n(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *boxlo, double *boxhi, int *tag, int **nspecial, - int **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); -void ljl_gpu_compute(const int timestep, const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success); +int ** ljl_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int 
**nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void ljl_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double ljl_gpu_bytes(); using namespace LAMMPS_NS; @@ -83,8 +84,6 @@ PairLJCutGPU::~PairLJCutGPU() void PairLJCutGPU::compute(int eflag, int vflag) { - int ntimestep = static_cast(update->ntimestep % MAXSMALLINT); - if (eflag || vflag) ev_setup(eflag,vflag); else evflag = vflag_fdotr = 0; @@ -92,30 +91,30 @@ void PairLJCutGPU::compute(int eflag, int vflag) int inum, host_start; bool success = true; - + int *ilist, *numneigh, **firstneigh; if (gpu_mode == GPU_NEIGH) { inum = atom->nlocal; - gpulist = ljl_gpu_compute_n(ntimestep, neighbor->ago, inum, nall, - atom->x, atom->type, domain->sublo, - domain->subhi, atom->tag, atom->nspecial, - atom->special, eflag, vflag, eflag_atom, - vflag_atom, host_start, cpu_time, success); + firstneigh = ljl_gpu_compute_n(neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, + &ilist, &numneigh, cpu_time, success); } else { inum = list->inum; - ljl_gpu_compute(ntimestep, neighbor->ago, inum, nall, atom->x, - atom->type, list->ilist, list->numneigh, list->firstneigh, - eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, - success); + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + ljl_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); } if (!success) error->one("Out of memory on GPGPU"); if 
(host_startpair_match("gpu",0) == NULL) - error->all("Cannot use pair hybrid with multiple GPU pair styles"); + if (force->newton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; @@ -151,15 +150,11 @@ void PairLJCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; - bool init_ok = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, - offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); - if (!init_ok) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - - if (force->newton_pair) - error->all("Cannot use newton pair with GPU LJ pair style"); + int success = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); if (gpu_mode != GPU_NEIGH) { int irequest = neighbor->request(this); @@ -178,11 +173,12 @@ double PairLJCutGPU::memory_usage() /* ---------------------------------------------------------------------- */ -void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) { - int i,j,ii,jj,inum,jnum,itype,jtype; +void PairLJCutGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) { + int i,j,ii,jj,jnum,itype,jtype; double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; double rsq,r2inv,r6inv,forcelj,factor_lj; - int *ilist,*jlist,*numneigh,**firstneigh; + int *jlist; double **x = atom->x; double **f = atom->f; @@ -190,11 +186,6 @@ void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) { int nlocal = atom->nlocal; double *special_lj = force->special_lj; - inum = list->inum; - ilist = list->ilist; - numneigh = list->numneigh; - firstneigh = list->firstneigh; - // loop over neighbors of my atoms for (ii = start; ii 
< inum; ii++) { @@ -238,73 +229,3 @@ void PairLJCutGPU::cpu_compute(int start, int eflag, int vflag) { } } } - -/* ---------------------------------------------------------------------- */ - -void PairLJCutGPU::cpu_compute(int *nbors, int start, int eflag, int vflag) { - int i,j,itype,jtype; - int nlocal = atom->nlocal; - int stride = nlocal-start; - double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; - double rsq,r2inv,r6inv,forcelj,factor_lj; - double *special_lj = force->special_lj; - - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - - // loop over neighbors of my atoms - - for (i = start; i < nlocal; i++) { - xtmp = x[i][0]; - ytmp = x[i][1]; - ztmp = x[i][2]; - itype = type[i]; - int *nbor = nbors + i - start; - int jnum = *nbor; - nbor += stride; - int *nbor_end = nbor + stride * jnum; - - for (; nbor Date: Mon, 2 May 2011 15:05:13 +0000 Subject: [PATCH 20/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6056 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/pair_hybrid.cpp | 4 ---- src/pair_lj_expand.h | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp index 786eb674fd..a3d0dafd32 100644 --- a/src/pair_hybrid.cpp +++ b/src/pair_hybrid.cpp @@ -199,7 +199,6 @@ void PairHybrid::settings(int narg, char **arg) // exception is 1st arg of style "table", which is non-numeric word // exception is 1st two args of style "lj/coul", which are non-numeric // exception is 1st two args of style "buck/coul", which are non-numeric - // exception is 1st arg of any "gpu" style, which is non-numeric // exception is 1st arg of reax/c style, which is non-numeric // need a better way to skip these exceptions @@ -209,7 +208,6 @@ void PairHybrid::settings(int narg, char **arg) if (strcmp(arg[i],"table") == 0) i++; if (strcmp(arg[i],"lj/coul") == 0) i += 2; if (strcmp(arg[i],"buck/coul") == 0) i += 2; - if (strstr(arg[i],"gpu")) i++; if (strcmp(arg[i],"reax/c") == 0) i++; i++; while (i < narg && 
!isalpha(arg[i][0])) i++; @@ -226,7 +224,6 @@ void PairHybrid::settings(int narg, char **arg) // exception is 1st arg of style "table", which is non-numeric // exception is 1st two args of style "lj/coul", which are non-numeric // exception is 1st two args of style "buck/coul", which are non-numeric - // exception is 1st arg of any "gpu" style, which is non-numeric // exception is 1st arg of reax/c style, which is non-numeric // need a better way to skip these exceptions @@ -247,7 +244,6 @@ void PairHybrid::settings(int narg, char **arg) if (strcmp(arg[i],"table") == 0) i++; if (strcmp(arg[i],"lj/coul") == 0) i += 2; if (strcmp(arg[i],"buck/coul") == 0) i += 2; - if (strstr(arg[i],"gpu")) i++; if (strcmp(arg[i],"reax/c") == 0) i++; i++; while (i < narg && !isalpha(arg[i][0])) i++; diff --git a/src/pair_lj_expand.h b/src/pair_lj_expand.h index fa6b136c67..1d1b10c315 100644 --- a/src/pair_lj_expand.h +++ b/src/pair_lj_expand.h @@ -38,7 +38,7 @@ class PairLJExpand : public Pair { void read_restart_settings(FILE *); double single(int, int, int, int, double, double, double, double &); - private: + protected: double cut_global; double **cut; double **epsilon,**sigma,**shift; From 0add57d01eef748c864c0de0394cf817f4f847ff Mon Sep 17 00:00:00 2001 From: sjplimp Date: Mon, 2 May 2011 15:10:37 +0000 Subject: [PATCH 21/21] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6058 f3b2605a-c512-4ea7-a41b-209d697bcdaa --- src/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/version.h b/src/version.h index b7cd4f016f..382d330033 100644 --- a/src/version.h +++ b/src/version.h @@ -1 +1 @@ -#define LAMMPS_VERSION "29 Apr 2011" +#define LAMMPS_VERSION "2 May 2011"
      pppm GPU single and double Mike Brown (ORNL)
      pair_style lj/cut/expand Inderaj Bains (NVIDIA)
      temperature accelerated dynamics (TAD) Aidan Thompson (Sandia)
      pair reax/c and fix qeq/reax Metin Aktulga (Purdue, now LBNL)
      DREIDING force field, pair_style hbond/dreiding, etc Tod Pascal (CalTech)
      fix adapt and compute ti for thermodynamic integration for free energies Sai Jayaraman (Sandia)
      pair born and pair gauss Sai Jayaraman (Sandia)
      stochastic rotation dynamics (SRD) via fix srd Jeremy Lechman (Sandia) and Pieter in 't Veld (BASF)
      ipp Perl script tool Reese Jones (Sandia)
      eam_database and createatoms tools Xiaowang Zhou (Sandia)
      electron force field (eFF) Andres Jaramillo-Botero and Julius Su (Caltech)