diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index 206492255e..167b01e3a1 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -93,9 +93,11 @@ __inline float fetch_q(const int& i, const float *q)
 // Thread block size for all kernels (Must be >=MAX_STENCIL^2)
 #define BLOCK_1D 64
 // Number of threads per pencil for charge spread
-#define PENCIL_SIZE MEM_THREADS
+//#define PENCIL_SIZE MEM_THREADS
+#define PENCIL_SIZE 32
 // Number of pencils per block for charge spread
-#define BLOCK_PENCILS (BLOCK_1D/PENCIL_SIZE)
+//#define BLOCK_PENCILS (BLOCK_1D/PENCIL_SIZE)
+#define BLOCK_PENCILS 2
 
 __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_,
                            const numtyp delvolinv, const int nlocal,
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 679bc67ebb..eab5103fcd 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -30,9 +30,11 @@
 // Thread block size for all kernels (Must be >=MAX_STENCIL^2)
 #define BLOCK_1D 64
 // Number of threads per pencil for charge spread
-#define PENCIL_SIZE MEM_THREADS
+//#define PENCIL_SIZE MEM_THREADS
+#define PENCIL_SIZE 32
 // Number of pencils per block for charge spread
-#define BLOCK_PENCILS (BLOCK_1D/PENCIL_SIZE)
+//#define BLOCK_PENCILS (BLOCK_1D/PENCIL_SIZE)
+#define BLOCK_PENCILS 2
 
 #define PPPMGPUMemoryT PPPMGPUMemory
 