diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index 522a0a5260..08c065fa30 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -59,6 +59,18 @@ __inline double fetch_q(const int& i, const double *q)
 {
   return q[i];
 }
+
+__device__ inline void atomicFloatAdd(double* address, double val) {
+  double old = *address, assumed;
+  do {
+    assumed = old;
+    old = __longlong_as_double( atomicCAS((unsigned long long int*)address,
+                                          __double_as_longlong(assumed),
+                                          __double_as_longlong(val +
+                                                               assumed)));
+  } while (assumed != old);
+}
+
 #else
 __inline float4 fetch_pos(const int& i, const float4 *pos)
 {
@@ -68,6 +80,21 @@ __inline float fetch_q(const int& i, const float *q)
 {
   return tex1Dfetch(q_tex, i);
 }
+
+__device__ inline void atomicFloatAdd(float *address, float val)
+{
+  int i_val = __float_as_int(val);
+  int tmp0 = 0;
+  int tmp1;
+
+  while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0)
+  {
+    tmp0 = tmp1;
+    i_val = __float_as_int(val + __int_as_float(tmp1));
+  }
+}
+
+
 #endif
 
 #else
@@ -88,8 +115,8 @@ __inline float fetch_q(const int& i, const float *q)
 
 __kernel void particle_map(__global numtyp4 *x_, const int nlocal,
                            __global int *counts, __global int *ans,
-                           const numtyp boxlo_x, const numtyp boxlo_y,
-                           const numtyp boxlo_z, const numtyp delxinv,
+                           const numtyp b_lo_x, const numtyp b_lo_y,
+                           const numtyp b_lo_z, const numtyp delxinv,
                            const numtyp delyinv, const numtyp delzinv,
                            const int nlocal_x, const int nlocal_y,
                            const int nlocal_z, const int atom_stride,
@@ -102,11 +129,11 @@ __kernel void particle_map(__global numtyp4 *x_, const int nlocal,
   if (ii<nlocal) {
     numtyp4 p=fetch_pos(ii,x_);
 
-    int nx=(p.x-boxlo_x)*delxinv;
-    int ny=(p.y-boxlo_y)*delyinv;
-    int nz=(p.z-boxlo_z)*delzinv;
+    int nx=(p.x-b_lo_x)*delxinv;
+    int ny=(p.y-b_lo_y)*delyinv;
+    int nz=(p.z-b_lo_z)*delzinv;
 
     if (nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z)
@@ -124,27 +151,14 @@ __kernel void particle_map(__global numtyp4 *x_, const int nlocal,
   }
 }
 
-__device__ inline void atomicFloatAdd(float *address, float val)
-{
-  int i_val = __float_as_int(val);
-  int tmp0 = 0;
-  int tmp1;
-
-  while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0)
-  {
-    tmp0 = tmp1;
-    i_val = __float_as_int(val + __int_as_float(tmp1));
-  }
-}
-
 __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
                        __global int *counts, __global int *atoms,
                        __global numtyp *brick, __global numtyp *rho_coeff,
                        const int atom_stride, const int npts_x,
                        const int npts_y, const int npts_z, const int nlower,
-                       const int nupper, const numtyp boxlo_x,
-                       const numtyp boxlo_y, const numtyp boxlo_z,
+                       const int nupper, const numtyp b_lo_x,
+                       const numtyp b_lo_y, const numtyp b_lo_z,
                        const numtyp delxinv, const numtyp delyinv,
                        const numtyp delzinv, const int order,
                        const numtyp delvolinv) {
@@ -167,9 +181,9 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
       numtyp4 p=fetch_pos(atom,x_);
       numtyp z0=delvolinv*fetch_q(atom,q_);
 
-      numtyp dx = nx - (p.x-boxlo_x)*delxinv;
-      numtyp dy = ny - (p.y-boxlo_y)*delyinv;
-      numtyp dz = nz - (p.z-boxlo_z)*delzinv;
+      numtyp dx = nx - (p.x-b_lo_x)*delxinv;
+      numtyp dy = ny - (p.y-b_lo_y)*delyinv;
+      numtyp dz = nz - (p.z-b_lo_z)*delzinv;
 
       numtyp rho1d[3][8];
       for (int k = 0; k < order; k++) {
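Side note on the double-precision branch added above: CUDA hardware of this generation has no native atomicAdd() for double, so the kernel emulates it by retrying an atomicCAS on the 64-bit pattern until no other thread has raced in between the read and the swap. Below is a minimal, self-contained sketch of that technique for readers who want to try it in isolation; the names atomic_add_double and accumulate, the launch sizes, and the test harness are made up for illustration and are not part of this patch.

// sketch_atomic_add_double.cu -- illustrative only, not part of the patch.
#include <cstdio>
#include <cuda_runtime.h>

// Same compare-and-swap idea as atomicFloatAdd(double*, double) above:
// reinterpret the double as a 64-bit integer, try to swap in old+val, and
// retry whenever another thread updated the slot in the meantime.
__device__ double atomic_add_double(double *address, double val) {
  unsigned long long int *ptr = (unsigned long long int *)address;
  unsigned long long int old = *ptr, assumed;
  do {
    assumed = old;
    old = atomicCAS(ptr, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__global__ void accumulate(double *sum, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    atomic_add_double(sum, 1.0);  // every thread contributes 1.0
}

int main() {
  double *d_sum, h_sum = 0.0;
  cudaMalloc(&d_sum, sizeof(double));
  cudaMemcpy(d_sum, &h_sum, sizeof(double), cudaMemcpyHostToDevice);
  accumulate<<<32, 128>>>(d_sum, 32 * 128);
  cudaMemcpy(&h_sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost);
  printf("sum = %g (expected %d)\n", h_sum, 32 * 128);
  cudaFree(d_sum);
  return 0;
}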
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 5928bcfea8..85414820c9 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -100,7 +100,7 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   _nyhi_out=nyhi_out;
   _nzhi_out=nzhi_out;
   _max_brick_atoms=10;
-  
+
   // Get rho_coeff on device
   int n2lo=(1-order)/2;
   int numel=order*( order/2 - n2lo + 1 );
@@ -142,6 +142,11 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   d_error_flag.zero();
   _max_bytes+=1;
 
+std::cout << "LO: " << _nxlo_out << " " << _nylo_out << " " << _nzlo_out << " " << _nlower << std::endl;
+std::cout << "HI: " << _nxhi_out << " " << _nyhi_out << " " << _nzhi_out << " " << _nupper << std::endl;
+std::cout << "pts: " << _npts_x << " " << _npts_y << " " << _npts_z << std::endl;
+std::cout << "local: " << _nlocal_x << " " << _nlocal_y << " " << _nlocal_z << std::endl;
+
   return h_brick.begin();
 }
@@ -218,43 +223,54 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
   int _max_atoms=10;
   int ainum=this->ans->inum();
 
-  // Boxlo adjusted to include ghost cells and shift for even stencil order
+  // Boxlo adjusted to be upper left brick and shift for even stencil order
   double shift=0.0;
   if (_order % 2)
-    shift-=0.5/delxinv;
+    shift=0.5;
+  numtyp f_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
+  numtyp f_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
+  numtyp f_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
 
-  numtyp f_boxlo_x=boxlo[0]+shift;
-  numtyp f_boxlo_y=boxlo[1]+shift;
-  numtyp f_boxlo_z=boxlo[2]+shift;
   numtyp f_delxinv=delxinv;
   numtyp f_delyinv=delyinv;
   numtyp f_delzinv=delzinv;
   double delvolinv = delxinv*delyinv*delzinv;
   numtyp f_delvolinv = delvolinv;
 
+std::cout << "BOX: " << boxlo[0] << " " << boxlo[1] << " " << boxlo[2] << "\n";
+std::cout << "Brick: " << f_brick_x << " " << f_brick_y << " " << f_brick_z << "\n";
+std::cout << "Delx: " << 1.0/delxinv << " " << 1.0/delyinv << " " << 1.0/delzinv << std::endl;
 
   time_map.start();
   d_brick_counts.zero();
   k_particle_map.set_size(GX,BX);
   k_particle_map.run(&atom->dev_x.begin(), &ainum, &d_brick_counts.begin(),
-                     &d_brick_atoms.begin(), &f_boxlo_x, &f_boxlo_y,
-                     &f_boxlo_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x,
+                     &d_brick_atoms.begin(), &f_brick_x, &f_brick_y,
+                     &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x,
                      &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms,
                      &d_error_flag.begin());
   time_map.stop();
 
+//std::cout << "COUNTS: " << d_brick_counts << std::endl;
+  if (_order % 2)
+    shift=0.0;
+  else
+    shift=0.5;
+  f_brick_x=boxlo[0]+(_nxlo_out-_nlower+shift)/delxinv;
+  f_brick_y=boxlo[1]+(_nylo_out-_nlower+shift)/delyinv;
+  f_brick_z=boxlo[2]+(_nzlo_out-_nlower+shift)/delzinv;
 
   time_rho.start();
   BX=block_x_size();
   int BY=block_y_size();
-  GX=static_cast<int>(ceil(static_cast<double>(_nlocal_x))/BX);
-  int GY=static_cast<int>(ceil(static_cast<double>(_nlocal_y))/BY);
+  GX=static_cast<int>(ceil(static_cast<double>(_nlocal_x)/BX));
+  int GY=static_cast<int>(ceil(static_cast<double>(_nlocal_y)/BY));
   d_brick.zero();
   k_make_rho.set_size(GX,GY,BX,BY);
   k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(),
                  &d_brick_counts.begin(), &d_brick_atoms.begin(),
                  &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, &_npts_x,
-                 &_npts_y, &_npts_z, &_nlower, &_nupper, &f_boxlo_x,
-                 &f_boxlo_y, &f_boxlo_z, &f_delxinv, &f_delyinv, &f_delzinv,
+                 &_npts_y, &_npts_z, &_nlower, &_nupper, &f_brick_x,
+                 &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv,
                  &_order, &f_delvolinv);
   time_rho.stop();
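Two host-side details in the pppm_gpu_memory.cpp hunk above are easy to miss. First, the value passed to the kernels is no longer the raw box origin but the lower corner of this processor's ghost-extended brick, offset by (_nxlo_out - _nlower) grid cells plus a half-cell shift that depends on whether the stencil order is odd or even. Second, the launch-grid computation now applies ceil() to the quotient rather than to the numerator, so partially filled blocks are still launched. A tiny host-only sketch of the second point, with made-up sizes (100 grid points, block size 64), is below; it is an illustration, not code from the patch.

// sketch_grid_rounding.cpp -- illustrative only, not part of the patch.
#include <cmath>
#include <cstdio>

int main() {
  const int nlocal_x = 100;  // example: grid points to cover
  const int BX = 64;         // example: threads per block

  // Old form: ceil() of an integer-valued double is a no-op, and the
  // integer cast then truncates the quotient -> one block too few.
  int old_gx = static_cast<int>(ceil(static_cast<double>(nlocal_x)) / BX);

  // Fixed form: divide first, then round up -> covers all points.
  int new_gx = static_cast<int>(ceil(static_cast<double>(nlocal_x) / BX));

  printf("old: %d block(s), fixed: %d block(s) for %d points, block size %d\n",
         old_gx, new_gx, nlocal_x, BX);  // prints 1 vs 2
  return 0;
}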
diff --git a/lib/gpu/pppm_l_gpu.cpp b/lib/gpu/pppm_l_gpu.cpp
index e75d291734..6ee0278326 100644
--- a/lib/gpu/pppm_l_gpu.cpp
+++ b/lib/gpu/pppm_l_gpu.cpp
@@ -28,11 +28,12 @@ static PPPMGPUMemory PPPMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-float * pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
-                      const int order, const int nxlo_out, const int nylo_out,
-                      const int nzlo_out, const int nxhi_out,
-                      const int nyhi_out, const int nzhi_out,
-                      double **rho_coeff, bool &success) {
+PRECISION * pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
+                          const int order, const int nxlo_out,
+                          const int nylo_out, const int nzlo_out,
+                          const int nxhi_out, const int nyhi_out,
+                          const int nzhi_out, double **rho_coeff,
+                          bool &success) {
   PPPMF.clear();
   int first_gpu=PPPMF.device->first_device();
   int last_gpu=PPPMF.device->last_device();
@@ -52,7 +53,7 @@ float * pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
   }
 
   success=true;
-  float * host_brick;
+  PRECISION * host_brick=NULL;
   if (world_me==0) {
     host_brick=PPPMF.init(nlocal,nall,screen,order,nxlo_out,nylo_out,
                           nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp
index b62065c8eb..1c2adcf106 100644
--- a/src/GPU/pppm_gpu.cpp
+++ b/src/GPU/pppm_gpu.cpp
@@ -37,7 +37,7 @@
 
 // External functions from cuda library for atom decomposition
 
-float* pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
+numtyp* pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
                      const int order, const int nxlo_out, const int nylo_out,
                      const int nzlo_out, const int nxhi_out, const int nyhi_out,
                      const int nzhi_out, double **rho_coeff, bool &success);
@@ -703,17 +703,36 @@
 double t3=MPI_Wtime();
   make_rho();
 time3+=MPI_Wtime()-t3;
+double max_error=0;
 int _npts_x=nxhi_out-nxlo_out+1;
 int _npts_y=nyhi_out-nylo_out+1;
 int _npts_z=nzhi_out-nzlo_out+1;
-double *cpup = &density_brick[nlower][nlower][nlower];
-float *gpup = host_brick;
+double *cpup = &density_brick[nxlo_out][nylo_out][nzlo_out];
+numtyp *gpup = host_brick;
 int iend=_npts_x*_npts_y*_npts_z;
+int counter_x=0, counter_y=0, counter_z=0;
 for (int i=0; i<iend; i++) {
+  double error=0.0;
+  if (fabs(*cpup)>1e-8)
+    error = fabs(((*gpup)-(*cpup))/(*cpup));
+  if (error>0.05)
+//    std::cout << "* ";
+    std::cout << counter_z << " " << counter_y << " " << counter_x << " CPU GPU: " << error << " " << *cpup << " " << *gpup << std::endl;
+  if (error>max_error)
+    max_error=error;
   cpup++;
   gpup++;
+  counter_x++;
+  if (counter_x==_npts_x) {
+    counter_x=0;
+    counter_y++;
+  }
+  if (counter_y==_npts_y) {
+    counter_y=0;
+    counter_z++;
+  }
 }
+std::cout << "Maximum relative error: " << max_error*100.0 << "%\n";
 
 // all procs communicate density values from their ghost cells
 // to fully sum contribution in their 3d bricks
diff --git a/src/GPU/pppm_gpu.h b/src/GPU/pppm_gpu.h
index 997f6bcaa4..773ff31b0f 100644
--- a/src/GPU/pppm_gpu.h
+++ b/src/GPU/pppm_gpu.h
@@ -23,6 +23,8 @@ KSpaceStyle(pppm/gpu,PPPMGPU)
 #include "pppm.h"
 #include "lmptype.h"
 
+#define numtyp float
+
 namespace LAMMPS_NS {
 
 class PPPMGPU : public KSpace {
@@ -98,7 +100,7 @@ class PPPMGPU : public KSpace {
   void compute_rho1d(double, double, double);
   void compute_rho_coeff();
   void slabcorr(int);
-float *host_brick;
+numtyp *host_brick;
   double time1,time2,time3;
 };
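For reference, the counter_x/counter_y/counter_z bookkeeping in the debug comparison loop in pppm_gpu.cpp is just an unflattening of the linear brick index with x varying fastest; the same (z, y, x) triple can be recovered directly from the flat index with modulo and division. A standalone sketch with made-up brick dimensions follows; it is an illustration, not code from the patch.

// sketch_unflatten.cpp -- illustrative only, not part of the patch.
#include <cstdio>

int main() {
  const int npts_x = 4, npts_y = 3, npts_z = 2;  // example brick dimensions
  const int iend = npts_x * npts_y * npts_z;

  for (int i = 0; i < iend; i++) {
    int x = i % npts_x;             // fastest-varying index
    int y = (i / npts_x) % npts_y;
    int z = i / (npts_x * npts_y);  // slowest-varying index
    printf("i=%2d -> z=%d y=%d x=%d\n", i, z, y, x);
  }
  return 0;
}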