diff --git a/lib/gpu/README b/lib/gpu/README
index 73a51fc391..b6358170f8 100644
--- a/lib/gpu/README
+++ b/lib/gpu/README
@@ -83,7 +83,11 @@ NOTE: PPPM acceleration can only be run on GPUs with compute capability>=1.1.
       when attempting to run PPPM on a GPU with compute capability 1.0.
 
 NOTE: Double precision is only supported on certain GPUs (with
-      compute capability>=1.3).
+      compute capability>=1.3). If you compile the GPU library for
+      a GPU with compute capability 1.1 and 1.2, then only single
+      precistion FFTs are supported, i.e. LAMMPS has to be compiled
+      with -DFFT_SINGLE. For details on configuring FFT support in 
+      LAMMPS, see http://lammps.sandia.gov/doc/Section_start.html#2_2_4
       
 NOTE: For Tesla and other graphics cards with compute capability>=1.3,
       make sure that -arch=sm_13 is set on the CUDA_ARCH line.
@@ -96,9 +100,8 @@ NOTE: The gayberne/gpu pair style will only be installed if the ASPHERE
 NOTE: The cg/cmm/gpu and cg/cmm/coul/long/gpu pair styles will only be
       installed if the USER-CG-CMM package has been installed.
 
-NOTE: The lj/cut/coul/long/gpu, cg/cmm/coul/long/gpu, pppm/gpu/single, and
-      pppm/gpu/double styles will only be installed if the KSPACE package has
-      been installed.
+NOTE: The lj/cut/coul/long/gpu, cg/cmm/coul/long/gpu, and pppm/gpu styles
+      will only be installed if the KSPACE package has been installed.
 
 NOTE: The lj/charmm/coul/long will only be installed if the MOLECULE package
       has been installed.
diff --git a/lib/gpu/cmmc_long_gpu_kernel.cu b/lib/gpu/cmmc_long_gpu_kernel.cu
index 39ae01cb78..e098efb7f3 100644
--- a/lib/gpu/cmmc_long_gpu_kernel.cu
+++ b/lib/gpu/cmmc_long_gpu_kernel.cu
@@ -200,10 +200,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
           prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
           forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }
 
         force = (force_lj + forcecoul) * r2inv;
 
@@ -212,7 +210,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
         f.z+=delz*force;
 
         if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
           if (rsq < lj1[mtype].y) {
             energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
                       lj3[mtype].w;
@@ -394,10 +393,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
           prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
           forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }
 
         force = (force_lj + forcecoul) * r2inv;
 
@@ -406,7 +403,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
         f.z+=delz*force;
 
         if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
           if (rsq < lj1[mtype].y) {
             energy += factor_lj*inv1*(lj3[mtype].y*inv2-lj3[mtype].z)-
                       lj3[mtype].w;
diff --git a/lib/gpu/crml_gpu_kernel.cu b/lib/gpu/crml_gpu_kernel.cu
index f2ba74c8b6..7722da9746 100644
--- a/lib/gpu/crml_gpu_kernel.cu
+++ b/lib/gpu/crml_gpu_kernel.cu
@@ -204,10 +204,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
           prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
           forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }
 
         force = (force_lj + forcecoul) * r2inv;
 
@@ -216,7 +214,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
         f.z+=delz*force;
 
         if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
           if (rsq < cut_ljsq) {
             numtyp e=r6inv*(lj1[mtype].z*r6inv-lj1[mtype].w);
             if (rsq > cut_lj_innersq)
@@ -407,10 +406,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
           prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
           forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }
 
         force = (force_lj + forcecoul) * r2inv;
 
@@ -419,7 +416,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp2 *ljd_in,
         f.z+=delz*force;
 
         if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
           if (rsq < cut_ljsq) {
             numtyp e=lj3-lj4;
             if (rsq > cut_lj_innersq)
diff --git a/lib/gpu/lj_class2_long.cu b/lib/gpu/lj_class2_long.cu
index 5fc7c408df..6bac225fa1 100644
--- a/lib/gpu/lj_class2_long.cu
+++ b/lib/gpu/lj_class2_long.cu
@@ -191,10 +191,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
           prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
           forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }
 
         force = (force_lj + forcecoul) * r2inv;
 
@@ -203,7 +201,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
         f.z+=delz*force;
 
         if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
           if (rsq < lj1[mtype].w) {
             numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
             energy+=factor_lj*(e-lj3[mtype].z);
@@ -379,10 +378,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
           prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
           forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }
 
         force = (force_lj + forcecoul) * r2inv;
 
@@ -391,7 +388,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
         f.z+=delz*force;
 
         if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
           if (rsq < lj1[mtype].w) {
             numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
             energy+=factor_lj*(e-lj3[mtype].z);
diff --git a/lib/gpu/ljcl_cut_gpu_kernel.cu b/lib/gpu/ljcl_cut_gpu_kernel.cu
index e177f1a0fd..638304247f 100644
--- a/lib/gpu/ljcl_cut_gpu_kernel.cu
+++ b/lib/gpu/ljcl_cut_gpu_kernel.cu
@@ -191,10 +191,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
           prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
           forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }
 
         force = (force_lj + forcecoul) * r2inv;
 
@@ -203,7 +201,8 @@ __kernel void kernel_pair(__global numtyp4 *x_, __global numtyp4 *lj1,
         f.z+=delz*force;
 
         if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
           if (rsq < lj1[mtype].w) {
             numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
             energy+=factor_lj*(e-lj3[mtype].z);
@@ -377,10 +376,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
           prefactor = qqrd2e * qtmp*fetch_q(j,q_)/r;
           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
-        } else {
+        } else
           forcecoul = (numtyp)0.0;
-          prefactor = (numtyp)0.0;
-        }
 
         force = (force_lj + forcecoul) * r2inv;
 
@@ -389,7 +386,8 @@ __kernel void kernel_pair_fast(__global numtyp4 *x_, __global numtyp4 *lj1_in,
         f.z+=delz*force;
 
         if (eflag>0) {
-          e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < cut_coulsq)
+            e_coul += prefactor*(_erfc-factor_coul);
           if (rsq < lj1[mtype].w) {
             numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
             energy+=factor_lj*(e-lj3[mtype].z);
diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index 4b50fbd930..93780f6980 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -144,7 +144,7 @@ __kernel void particle_map(__global numtyp4 *x_,  __global numtyp *q_,
       
         int i=nz*nlocal_y*nlocal_x+ny*nlocal_x+nx;
         int old=atom_add(counts+i, 1);
-        if (old==max_atoms) {
+        if (old>=max_atoms) {
           *error=2;
           atom_add(counts+i, -1);
         } else
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 2f7b35d051..43e9f8c753 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -51,7 +51,7 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
                               const int order, const int nxlo_out,
                               const int nylo_out, const int nzlo_out,
                               const int nxhi_out, const int nyhi_out,
-                              const int nzhi_out, double **rho_coeff,
+                              const int nzhi_out, grdtyp **rho_coeff,
                               grdtyp **vd_brick, const double slab_volfactor, 
                               const int nx_pppm, const int ny_pppm,
                               const int nz_pppm, int &flag) {
@@ -123,7 +123,7 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   int numel=order*( order/2 - n2lo + 1 );
   success=success && (d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY)==
                       UCL_SUCCESS);
-  UCL_H_Vec<double> view;
+  UCL_H_Vec<grdtyp> view;
   view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
   ucl_copy(d_rho_coeff,view,true);
   _max_bytes+=d_rho_coeff.row_bytes();
diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h
index a6e57535bb..870c8aeb92 100644
--- a/lib/gpu/pppm_gpu_memory.h
+++ b/lib/gpu/pppm_gpu_memory.h
@@ -46,7 +46,7 @@ class PPPMGPUMemory {
   grdtyp * init(const int nlocal, const int nall, FILE *screen, const int order,
                 const int nxlo_out, const int nylo_out, const int nzlo_out,
                 const int nxhi_out, const int nyhi_out, const int nzhi_out,
-                double **rho_coeff, grdtyp **vd_brick, 
+                grdtyp **rho_coeff, grdtyp **vd_brick, 
                 const double slab_volfactor, const int nx_pppm, 
                 const int ny_pppm, const int nz_pppm, int &success);
 
diff --git a/lib/gpu/pppm_l_gpu.cpp b/lib/gpu/pppm_l_gpu.cpp
index 200d2f1685..c48462abf7 100644
--- a/lib/gpu/pppm_l_gpu.cpp
+++ b/lib/gpu/pppm_l_gpu.cpp
@@ -34,7 +34,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall,
                        FILE *screen, const int order, const int nxlo_out, 
                        const int nylo_out, const int nzlo_out,
                        const int nxhi_out, const int nyhi_out,
-                       const int nzhi_out, double **rho_coeff,
+                       const int nzhi_out, grdtyp **rho_coeff,
                        grdtyp **vd_brick, const double slab_volfactor,
                        const int nx_pppm, const int ny_pppm, const int nz_pppm,
                        int &success) {
@@ -95,7 +95,7 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
                         const int order, const int nxlo_out, 
                         const int nylo_out, const int nzlo_out,
                         const int nxhi_out, const int nyhi_out,
-                        const int nzhi_out, double **rho_coeff,
+                        const int nzhi_out, float **rho_coeff,
                         float **vd_brick, const double slab_volfactor,
                         const int nx_pppm, const int ny_pppm, const int nz_pppm,
                         int &success) {