diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index 80a38ca168..f09e7cfddb 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -19,6 +19,7 @@
 #define PPPM_GPU_KERNEL
 
 #define MAX_STENCIL 8
+#define BLOCK_1D 64
 
 #ifdef _DOUBLE_DOUBLE
 #define numtyp double
@@ -168,6 +169,7 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
     int ri=nx*order+ny;
     rho_coeff[ri]=_rho_coeff[ri];
   }
+  __syncthreads();
   
   nx+=BLOCK_ID_X*BLOCK_SIZE_X;
   ny+=BLOCK_ID_Y*BLOCK_SIZE_Y;
@@ -217,5 +219,146 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
 	}
 }
 
+/* --------------------------- */
+
+__kernel void make_rho2(__global numtyp4 *x_, __global numtyp *q_,
+                       __global int *counts, __global int *atoms,
+                       __global numtyp *brick, __global numtyp *_rho_coeff,
+                       const int atom_stride, const int npts_x,
+                       const int npts_y, const int nlocal_x, const int nlocal_y,
+                       const int nlocal_z, const numtyp b_lo_x,
+                       const numtyp b_lo_y, const numtyp b_lo_z,
+                       const numtyp delxinv, const numtyp delyinv,
+                       const numtyp delzinv, const int order,
+                       const numtyp delvolinv) {
+  __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL];
+
+  int nx=BLOCK_ID_X;
+  int ny=BLOCK_ID_Y;
+  int tx=THREAD_ID_X;
+  if (tx<order*order)
+    rho_coeff[tx]=_rho_coeff[tx];
+  __syncthreads();
+
+  int nz=tx;
+  if (nx<nlocal_x && ny<nlocal_y) {
+    int z_stride=nlocal_x*nlocal_y;
+    int z_pos=nz*z_stride+ny*nlocal_x+nx;
+    z_stride*=BLOCK_1D;
+    for ( ; nz<nlocal_z; nz+=BLOCK_1D) {
+      int natoms=counts[z_pos];
+      for (int i=0; i<order; i++)
+        for (int j=0; j<order; j++) {
+          int atom=atoms[atom_stride*row+z_pos+i*nlocal_x+j];
+          numtyp4 p=fetch_pos(atom,x_);
+          numtyp z0=delvolinv*fetch_q(atom,q_);
+        
+          numtyp dx = nx - (p.x-b_lo_x)*delxinv;
+          numtyp dy = ny - (p.y-b_lo_y)*delyinv;
+          numtyp dz = nz - (p.z-b_lo_z)*delzinv;
+          
+z0*rho1d_2*          rho1d[1][j]*rho1d[0][i] ??
+
+        numtyp rho1d[2][MAX_STENCIL];
+        for (int k = 0; k < order; k++) {
+          rho1d[0][k] = 0.0;
+          rho1d[1][k] = 0.0;
+          for (int l = order-1; l >= 0; l--) {
+            rho1d[0][k] = rho_coeff[l*order+k] + rho1d[0][k]*dx;
+            rho1d[1][k] = rho_coeff[l*order+k] + rho1d[1][k]*dy;
+          }
+        }
+        
+        for (int n = 0; n < order; n++) {
+          numtyp rho1d_2 = 0.0;
+          for (int k = order-1; k >= 0; k--)
+            rho1d_2 = rho_coeff[k*order+n] + rho1d_2*dz;
+          numtyp y0 = z0*rho1d_2;
+          int mz = (n+nz)*npts_y*npts_x + ny*npts_x +nx;
+          for (int m = 0; m < order; m++) {
+	          numtyp x0 = y0*rho1d[1][m];
+	          for (int l = 0; l < order; l++) {
+              atomicFloatAdd(brick+mz+l,x0*rho1d[0][l]);
+	          }
+	          mz+=npts_x;
+	        }
+	      }
+	    }
+	    z_pos+=z_stride;
+	  }
+	}
+}
+
+/* --------------------------- */
+
+__kernel void make_rho3(__global numtyp4 *x_, __global numtyp *q_,
+                       const int nlocal,
+                       __global numtyp *brick, __global numtyp *_rho_coeff,
+                       const int npts_x,
+                       const int npts_y, const int nlocal_x, const int nlocal_y,
+                       const int nlocal_z, const numtyp b_lo_x,
+                       const numtyp b_lo_y, const numtyp b_lo_z,
+                       const numtyp delxinv, const numtyp delyinv,
+                       const numtyp delzinv, const numtyp shift,
+                       const int order,
+                       const numtyp delvolinv, __global int *error) {
+  __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL];
+  int ii=THREAD_ID_X;
+  if (ii<order*order)
+    rho_coeff[ii]=_rho_coeff[ii];
+  __syncthreads();
+  
+  ii+=BLOCK_ID_X*BLOCK_SIZE_X;
+  
+  int nx,ny,nz;
+  numtyp tx,ty,tz;
+
+  if (ii<nlocal) {
+    numtyp4 p=fetch_pos(ii,x_);
+
+    tx=(p.x-b_lo_x)*delxinv;
+    nx=int(tx);
+    ty=(p.y-b_lo_y)*delyinv;
+    ny=int(ty);
+    tz=(p.z-b_lo_z)*delzinv;
+    nz=int(tz);
+
+    if (tx<0 || ty<0 || tz<0 || nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z)
+      *error=1;
+    else {
+      numtyp z0=delvolinv*fetch_q(ii,q_);
+        
+      numtyp dx = nx+shift - tx;
+      numtyp dy = ny+shift - ty;
+      numtyp dz = nz+shift - tz;
+
+      numtyp rho1d[2][MAX_STENCIL];
+      for (int k = 0; k < order; k++) {
+        rho1d[0][k] = 0.0;
+        rho1d[1][k] = 0.0;
+        for (int l = order-1; l >= 0; l--) {
+          rho1d[0][k] = rho_coeff[l*order+k] + rho1d[0][k]*dx;
+          rho1d[1][k] = rho_coeff[l*order+k] + rho1d[1][k]*dy;
+        }
+      }
+        
+      for (int n = 0; n < order; n++) {
+        numtyp rho1d_2 = 0.0;
+        for (int k = order-1; k >= 0; k--)
+          rho1d_2 = rho_coeff[k*order+n] + rho1d_2*dz;
+        numtyp y0 = z0*rho1d_2;
+        int mz = (n+nz)*npts_y*npts_x + ny*npts_x +nx;
+        for (int m = 0; m < order; m++) {
+          numtyp x0 = y0*rho1d[1][m];
+	        for (int l = 0; l < order; l++) {
+            atomicFloatAdd(brick+mz+l,x0*rho1d[0][l]);
+	        }
+          mz+=npts_x;
+        }
+	    }
+	  }
+	}
+}
+
 #endif
 
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 9393e95dac..0c278cd8cb 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -181,7 +181,7 @@ void PPPMGPUMemoryT::clear() {
 
   device->clear();
 }
-
+/*
 // ---------------------------------------------------------------------------
 // Copy nbor list from host if necessary and then calculate forces, virials,..
 // ---------------------------------------------------------------------------
@@ -237,9 +237,6 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
   double delvolinv = delxinv*delyinv*delzinv;
   numtyp f_delvolinv = delvolinv;
 
-std::cout << "BOX: " << boxlo[0] << " " << boxlo[1] << " " << boxlo[2] << "\n";
-std::cout << "Brick: " << f_brick_x << " " << f_brick_y << " " << f_brick_z << "\n";
-std::cout << "Delx: " << 1.0/delxinv << " " << 1.0/delyinv << " " << 1.0/delzinv << std::endl;
   time_map.start();
   d_brick_counts.zero();
   k_particle_map.set_size(GX,BX);
@@ -249,7 +246,6 @@ std::cout << "Delx: " << 1.0/delxinv << " " << 1.0/delyinv << " " << 1.0/delzinv
                      &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, 
                      &d_error_flag.begin());
   time_map.stop();
-//std::cout << "COUNTS: " << d_brick_counts << std::endl;
 
   if (_order % 2)
     shift=0.0;
@@ -291,6 +287,84 @@ std::cout << "Delx: " << 1.0/delxinv << " " << 1.0/delyinv << " " << 1.0/delzinv
   
   return h_error_flag[0];
 }
+*/
+// ---------------------------------------------------------------------------
+// Copy nbor list from host if necessary and then calculate forces, virials,..
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
+                            double **host_x, int *host_type, bool &success,
+                            double *host_q, double *boxlo, 
+                            const double delxinv, const double delyinv,
+                            const double delzinv) {
+  acc_timers();
+  if (nlocal==0) {
+    zero_timers();
+    return 0;
+  }
+  
+  ans->inum(nlocal);
+
+  if (ago==0) {
+    resize_atom(nlocal,nall,success);
+    resize_local(nlocal,success);
+    if (!success)
+      return 0;
+
+    double bytes=ans->gpu_bytes();
+    if (bytes>_max_an_bytes)
+      _max_an_bytes=bytes;
+  }
+
+  atom->cast_x_data(host_x,host_type);
+  atom->cast_q_data(host_q);
+  atom->add_x_data(host_x,host_type);
+  atom->add_q_data();
+
+  // Compute the block size and grid size to keep all cores busy
+  int BX=this->block_size();
+  
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/BX));
+
+  int ainum=this->ans->inum();
+  
+  // Boxlo adjusted to be upper left brick and shift for even stencil order
+  double shift=0.0;
+  numtyp shift2=-0.5;
+  if (_order % 2) {
+    shift=0.5;
+    shift2=0.5;
+  }
+  numtyp f_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
+  numtyp f_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
+  numtyp f_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
+  
+  numtyp f_delxinv=delxinv;
+  numtyp f_delyinv=delyinv;
+  numtyp f_delzinv=delzinv;
+  double delvolinv = delxinv*delyinv*delzinv;
+  numtyp f_delvolinv = delvolinv;
+
+  time_map.start();
+  time_map.stop();
+
+  time_rho.start();
+  d_brick.zero();
+  k_make_rho.set_size(GX,BX);
+  k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum,
+                 &d_brick.begin(), &d_rho_coeff.begin(), &_npts_x,
+                 &_npts_y, &_nlocal_x, &_nlocal_y, &_nlocal_z, &f_brick_x,
+                 &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv,
+                 &shift2, &_order, &f_delvolinv, &d_error_flag.begin());
+  time_rho.stop();
+
+  time_out.start();
+  ucl_copy(h_brick,d_brick,true);
+  ucl_copy(h_error_flag,d_error_flag,false);
+  time_out.stop();
+  
+  return h_error_flag[0];
+}
 
 template <class numtyp, class acctyp>
 double PPPMGPUMemoryT::host_memory_usage() const {
@@ -308,7 +382,7 @@ void PPPMGPUMemoryT::compile_kernels(UCL_Device &dev) {
   pppm_program=new UCL_Program(dev);
   pppm_program->load_string(pppm_gpu_kernel,flags.c_str());
   k_particle_map.set_function(*pppm_program,"particle_map");
-  k_make_rho.set_function(*pppm_program,"make_rho");
+  k_make_rho.set_function(*pppm_program,"make_rho2");
   pos_tex.get_texture(*pppm_program,"pos_tex");
   q_tex.get_texture(*pppm_program,"q_tex");