diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index 206492255e..167b01e3a1 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -93,9 +93,11 @@ __inline float fetch_q(const int& i, const float *q)
 // Thread block size for all kernels (Must be >=MAX_STENCIL^2)
 #define BLOCK_1D 64
 // Number of threads per pencil for charge spread
-#define PENCIL_SIZE MEM_THREADS
+// Fixed at 32 (was MEM_THREADS); pppm_gpu_memory.cpp defines the same value
+#define PENCIL_SIZE 32
 // Number of pencils per block for charge spread
-#define BLOCK_PENCILS (BLOCK_1D/PENCIL_SIZE)
+// Derived from BLOCK_1D and PENCIL_SIZE so it cannot drift (64/32 = 2)
+#define BLOCK_PENCILS (BLOCK_1D/PENCIL_SIZE)
 
 __kernel void particle_map(__global numtyp4 *x_,  __global numtyp *q_,
                            const numtyp delvolinv, const int nlocal, 
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 679bc67ebb..eab5103fcd 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -30,9 +30,11 @@
 // Thread block size for all kernels (Must be >=MAX_STENCIL^2)
 #define BLOCK_1D 64
 // Number of threads per pencil for charge spread
-#define PENCIL_SIZE MEM_THREADS
+// Fixed at 32 (was MEM_THREADS); pppm_gpu_kernel.cu defines the same value
+#define PENCIL_SIZE 32
 // Number of pencils per block for charge spread
-#define BLOCK_PENCILS (BLOCK_1D/PENCIL_SIZE)
+// Derived from BLOCK_1D and PENCIL_SIZE so it cannot drift (64/32 = 2)
+#define BLOCK_PENCILS (BLOCK_1D/PENCIL_SIZE)
 
 #define PPPMGPUMemoryT PPPMGPUMemory<numtyp, acctyp>