From 29bbace06a048d5a027a82156c920ca05eaead17 Mon Sep 17 00:00:00 2001 From: "W. Michael Brown" Date: Fri, 25 Feb 2011 20:21:07 -0500 Subject: [PATCH] Switching PPPM kernel to use coalesced access. --- lib/gpu/pppm_gpu_kernel.cu | 91 ++++++++++++++++++------------------- lib/gpu/pppm_gpu_memory.cpp | 4 +- 2 files changed, 46 insertions(+), 49 deletions(-) diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu index 08f434c463..d9b6f4a9fa 100644 --- a/lib/gpu/pppm_gpu_kernel.cu +++ b/lib/gpu/pppm_gpu_kernel.cu @@ -151,92 +151,89 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_, __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL]; __local numtyp front[BLOCK_1D+MAX_STENCIL]; __local numtyp ans[MAX_STENCIL][BLOCK_1D]; - __local int nx,ny,x_start,y_start,x_stop,y_stop; - __local int z_stride, z_local_stride; + __local int ny,nz,y_start,z_start,y_stop,z_stop; + __local int z_stride; - int tx=THREAD_ID_X; - int tx_halo=BLOCK_1D+tx; - if (tx=nlocal_x) - x_stop-=nx-nlocal_x+1; + if (nz=nlocal_y) y_stop-=ny-nlocal_y+1; - z_stride=mul24(npts_yx,BLOCK_1D); - z_local_stride=mul24(mul24(nlocal_x,nlocal_y),BLOCK_1D); + if (nz>=nlocal_z) + z_stop-=nz-nlocal_z+1; + z_stride=mul24(nlocal_x,nlocal_y); } - if (tx -1; k-=order) { - rho1d_1=rho_coeff[k-m]+rho1d_1*delta.y; - rho1d_0=rho_coeff[k-l]+rho1d_0*delta.x; + rho1d_1=rho_coeff[k-l]+rho1d_1*delta.y; + rho1d_2=rho_coeff[k-m]+rho1d_2*delta.z; } - delta.w*=rho1d_1*rho1d_0; + delta.w*=rho1d_1*rho1d_2; for (int n=0; n=n; k-=order) - rho1d_2=rho_coeff[k]+rho1d_2*delta.z; - ans[n][tx]+=delta.w*rho1d_2; + rho1d_0=rho_coeff[k]+rho1d_0*delta.x; + ans[n][tid]+=delta.w*rho1d_0; } } + y_pos+=nlocal_x; } + z_pos+=z_stride; } } __syncthreads(); - if (txdev_x.begin(), &atom->dev_q.begin(), &d_brick_counts.begin(), &d_brick_atoms.begin(),