diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu index 80a38ca168..f09e7cfddb 100644 --- a/lib/gpu/pppm_gpu_kernel.cu +++ b/lib/gpu/pppm_gpu_kernel.cu @@ -19,6 +19,7 @@ #define PPPM_GPU_KERNEL #define MAX_STENCIL 8 +#define BLOCK_1D 64 #ifdef _DOUBLE_DOUBLE #define numtyp double @@ -168,6 +169,7 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_, int ri=nx*order+ny; rho_coeff[ri]=_rho_coeff[ri]; } + __syncthreads(); nx+=BLOCK_ID_X*BLOCK_SIZE_X; ny+=BLOCK_ID_Y*BLOCK_SIZE_Y; @@ -217,5 +219,146 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_, } } +/* --------------------------- */ + +__kernel void make_rho2(__global numtyp4 *x_, __global numtyp *q_, + __global int *counts, __global int *atoms, + __global numtyp *brick, __global numtyp *_rho_coeff, + const int atom_stride, const int npts_x, + const int npts_y, const int nlocal_x, const int nlocal_y, + const int nlocal_z, const numtyp b_lo_x, + const numtyp b_lo_y, const numtyp b_lo_z, + const numtyp delxinv, const numtyp delyinv, + const numtyp delzinv, const int order, + const numtyp delvolinv) { + __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL]; + + int nx=BLOCK_ID_X; + int ny=BLOCK_ID_Y; + int tx=THREAD_ID_X; + if (tx= 0; l--) { + rho1d[0][k] = rho_coeff[l*order+k] + rho1d[0][k]*dx; + rho1d[1][k] = rho_coeff[l*order+k] + rho1d[1][k]*dy; + } + } + + for (int n = 0; n < order; n++) { + numtyp rho1d_2 = 0.0; + for (int k = order-1; k >= 0; k--) + rho1d_2 = rho_coeff[k*order+n] + rho1d_2*dz; + numtyp y0 = z0*rho1d_2; + int mz = (n+nz)*npts_y*npts_x + ny*npts_x +nx; + for (int m = 0; m < order; m++) { + numtyp x0 = y0*rho1d[1][m]; + for (int l = 0; l < order; l++) { + atomicFloatAdd(brick+mz+l,x0*rho1d[0][l]); + } + mz+=npts_x; + } + } + } + z_pos+=z_stride; + } + } +} + +/* --------------------------- */ + +__kernel void make_rho3(__global numtyp4 *x_, __global numtyp *q_, + const int nlocal, + __global numtyp *brick, __global numtyp *_rho_coeff, + const int npts_x, + const int npts_y, const int nlocal_x, const int nlocal_y, + const int nlocal_z, const numtyp b_lo_x, + const numtyp b_lo_y, const numtyp b_lo_z, + const numtyp delxinv, const numtyp delyinv, + const numtyp delzinv, const numtyp shift, + const int order, + const numtyp delvolinv, __global int *error) { + __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL]; + int ii=THREAD_ID_X; + if (ii=nlocal_x || ny>=nlocal_y || nz>=nlocal_z) + *error=1; + else { + numtyp z0=delvolinv*fetch_q(ii,q_); + + numtyp dx = nx+shift - tx; + numtyp dy = ny+shift - ty; + numtyp dz = nz+shift - tz; + + numtyp rho1d[2][MAX_STENCIL]; + for (int k = 0; k < order; k++) { + rho1d[0][k] = 0.0; + rho1d[1][k] = 0.0; + for (int l = order-1; l >= 0; l--) { + rho1d[0][k] = rho_coeff[l*order+k] + rho1d[0][k]*dx; + rho1d[1][k] = rho_coeff[l*order+k] + rho1d[1][k]*dy; + } + } + + for (int n = 0; n < order; n++) { + numtyp rho1d_2 = 0.0; + for (int k = order-1; k >= 0; k--) + rho1d_2 = rho_coeff[k*order+n] + rho1d_2*dz; + numtyp y0 = z0*rho1d_2; + int mz = (n+nz)*npts_y*npts_x + ny*npts_x +nx; + for (int m = 0; m < order; m++) { + numtyp x0 = y0*rho1d[1][m]; + for (int l = 0; l < order; l++) { + atomicFloatAdd(brick+mz+l,x0*rho1d[0][l]); + } + mz+=npts_x; + } + } + } + } +} + #endif diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp index 9393e95dac..0c278cd8cb 100644 --- a/lib/gpu/pppm_gpu_memory.cpp +++ b/lib/gpu/pppm_gpu_memory.cpp @@ -181,7 +181,7 @@ void PPPMGPUMemoryT::clear() { device->clear(); } - +/* // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- @@ -237,9 +237,6 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, double delvolinv = delxinv*delyinv*delzinv; numtyp f_delvolinv = delvolinv; -std::cout << "BOX: " << boxlo[0] << " " << boxlo[1] << " " << boxlo[2] << "\n"; -std::cout << "Brick: " << f_brick_x << " " << f_brick_y << " " << f_brick_z << "\n"; -std::cout << "Delx: " << 1.0/delxinv << " " << 1.0/delyinv << " " << 1.0/delzinv << std::endl; time_map.start(); d_brick_counts.zero(); k_particle_map.set_size(GX,BX); @@ -249,7 +246,6 @@ std::cout << "Delx: " << 1.0/delxinv << " " << 1.0/delyinv << " " << 1.0/delzinv &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, &d_error_flag.begin()); time_map.stop(); -//std::cout << "COUNTS: " << d_brick_counts << std::endl; if (_order % 2) shift=0.0; @@ -291,6 +287,84 @@ std::cout << "Delx: " << 1.0/delxinv << " " << 1.0/delyinv << " " << 1.0/delzinv return h_error_flag[0]; } +*/ +// --------------------------------------------------------------------------- +// Copy nbor list from host if necessary and then calculate forces, virials,.. +// --------------------------------------------------------------------------- +template +int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, + const double delxinv, const double delyinv, + const double delzinv) { + acc_timers(); + if (nlocal==0) { + zero_timers(); + return 0; + } + + ans->inum(nlocal); + + if (ago==0) { + resize_atom(nlocal,nall,success); + resize_local(nlocal,success); + if (!success) + return 0; + + double bytes=ans->gpu_bytes(); + if (bytes>_max_an_bytes) + _max_an_bytes=bytes; + } + + atom->cast_x_data(host_x,host_type); + atom->cast_q_data(host_q); + atom->add_x_data(host_x,host_type); + atom->add_q_data(); + + // Compute the block size and grid size to keep all cores busy + int BX=this->block_size(); + + int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + + int ainum=this->ans->inum(); + + // Boxlo adjusted to be upper left brick and shift for even stencil order + double shift=0.0; + numtyp shift2=-0.5; + if (_order % 2) { + shift=0.5; + shift2=0.5; + } + numtyp f_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv; + numtyp f_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv; + numtyp f_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv; + + numtyp f_delxinv=delxinv; + numtyp f_delyinv=delyinv; + numtyp f_delzinv=delzinv; + double delvolinv = delxinv*delyinv*delzinv; + numtyp f_delvolinv = delvolinv; + + time_map.start(); + time_map.stop(); + + time_rho.start(); + d_brick.zero(); + k_make_rho.set_size(GX,BX); + k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum, + &d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, + &_npts_y, &_nlocal_x, &_nlocal_y, &_nlocal_z, &f_brick_x, + &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, + &shift2, &_order, &f_delvolinv, &d_error_flag.begin()); + time_rho.stop(); + + time_out.start(); + ucl_copy(h_brick,d_brick,true); + ucl_copy(h_error_flag,d_error_flag,false); + time_out.stop(); + + return h_error_flag[0]; +} template double PPPMGPUMemoryT::host_memory_usage() const { @@ -308,7 +382,7 @@ void PPPMGPUMemoryT::compile_kernels(UCL_Device &dev) { pppm_program=new UCL_Program(dev); pppm_program->load_string(pppm_gpu_kernel,flags.c_str()); k_particle_map.set_function(*pppm_program,"particle_map"); - k_make_rho.set_function(*pppm_program,"make_rho"); + k_make_rho.set_function(*pppm_program,"make_rho2"); pos_tex.get_texture(*pppm_program,"pos_tex"); q_tex.get_texture(*pppm_program,"q_tex");