diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu index 08c065fa30..80a38ca168 100644 --- a/lib/gpu/pppm_gpu_kernel.cu +++ b/lib/gpu/pppm_gpu_kernel.cu @@ -18,7 +18,7 @@ #ifndef PPPM_GPU_KERNEL #define PPPM_GPU_KERNEL -#define OFFSET 16384 +#define MAX_STENCIL 8 #ifdef _DOUBLE_DOUBLE #define numtyp double @@ -153,24 +153,26 @@ __kernel void particle_map(__global numtyp4 *x_, const int nlocal, __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_, __global int *counts, __global int *atoms, - __global numtyp *brick, __global numtyp *rho_coeff, - const int atom_stride, - const int npts_x, const int npts_y, - const int npts_z, const int nlower, - const int nupper, const numtyp b_lo_x, + __global numtyp *brick, __global numtyp *_rho_coeff, + const int atom_stride, const int npts_x, + const int npts_y, const int nlocal_x, const int nlocal_y, + const int nlocal_z, const numtyp b_lo_x, const numtyp b_lo_y, const numtyp b_lo_z, const numtyp delxinv, const numtyp delyinv, - const numtyp delzinv, - const int order, const numtyp delvolinv) { - // ii indexes the two interacting particles in gi - int nx=GLOBAL_ID_X; - int ny=GLOBAL_ID_Y; + const numtyp delzinv, const int order, + const numtyp delvolinv) { + __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL]; + int nx=THREAD_ID_X; + int ny=THREAD_ID_Y; + if (nx= 0; l--) { rho1d[0][k] = rho_coeff[l*order+k] + rho1d[0][k]*dx; rho1d[1][k] = rho_coeff[l*order+k] + rho1d[1][k]*dy; - rho1d[2][k] = rho_coeff[l*order+k] + rho1d[2][k]*dz; } } for (int n = 0; n < order; n++) { - int mz = n+nz; - numtyp y0 = z0*rho1d[2][n]; + numtyp rho1d_2 = 0.0; + for (int k = order-1; k >= 0; k--) + rho1d_2 = rho_coeff[k*order+n] + rho1d_2*dz; + numtyp y0 = z0*rho1d_2; + int mz = (n+nz)*npts_y*npts_x + ny*npts_x +nx; for (int m = 0; m < order; m++) { - int my = m+ny; numtyp x0 = y0*rho1d[1][m]; for (int l = 0; l < order; l++) { - int mx = l+nx; - int bi = mz*npts_y*npts_x+my*npts_x+mx; - atomicFloatAdd(brick+bi,x0*rho1d[0][l]); + atomicFloatAdd(brick+mz+l,x0*rho1d[0][l]); } + mz+=npts_x; } } } diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp index 85414820c9..9393e95dac 100644 --- a/lib/gpu/pppm_gpu_memory.cpp +++ b/lib/gpu/pppm_gpu_memory.cpp @@ -269,7 +269,7 @@ std::cout << "Delx: " << 1.0/delxinv << " " << 1.0/delyinv << " " << 1.0/delzinv k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &d_brick_counts.begin(), &d_brick_atoms.begin(), &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, &_npts_x, - &_npts_y, &_npts_z, &_nlower, &_nupper, &f_brick_x, + &_npts_y, &_nlocal_x, &_nlocal_y, &_nlocal_z, &f_brick_x, &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, &_order, &f_delvolinv); time_rho.stop();