From e8da16ff23acaa7625643a26bbc90302eb4fe565 Mon Sep 17 00:00:00 2001 From: "W. Michael Brown" Date: Thu, 17 Feb 2011 13:37:02 -0500 Subject: [PATCH] Finishing laptop optimization of PPPM charge spread kernels. --- lib/gpu/geryon/ucl_nv_kernel.h | 2 + lib/gpu/pair_gpu_device.cpp | 1 + lib/gpu/pppm_gpu_kernel.cu | 69 ++++++++++++++++++++++------------ lib/gpu/pppm_gpu_memory.cpp | 51 +++++++++++++++++-------- lib/gpu/pppm_gpu_memory.h | 1 + src/GPU/pppm_gpu.cpp | 11 +++--- 6 files changed, 90 insertions(+), 45 deletions(-) diff --git a/lib/gpu/geryon/ucl_nv_kernel.h b/lib/gpu/geryon/ucl_nv_kernel.h index 47f6f235b6..0040447549 100644 --- a/lib/gpu/geryon/ucl_nv_kernel.h +++ b/lib/gpu/geryon/ucl_nv_kernel.h @@ -27,6 +27,8 @@ #define GLOBAL_ID_X threadIdx.x+__mul24(blockIdx.x,blockDim.x) #define GLOBAL_ID_Y threadIdx.y+__mul24(blockIdx.y,blockDim.y) +#define GLOBAL_SIZE_X __mul24(gridDim.x,blockDim.x); +#define GLOBAL_SIZE_Y __mul24(gridDim.y,blockDim.y); #define THREAD_ID_X threadIdx.x #define THREAD_ID_Y threadIdx.y #define BLOCK_ID_X blockIdx.x diff --git a/lib/gpu/pair_gpu_device.cpp b/lib/gpu/pair_gpu_device.cpp index b94068629c..550c1207b5 100644 --- a/lib/gpu/pair_gpu_device.cpp +++ b/lib/gpu/pair_gpu_device.cpp @@ -293,6 +293,7 @@ void PairGPUDeviceT::output_kspace_times(UCL_Timer &time_in, fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Kernel (rho): %.4f s.\n",times[3]/_replica_size); // fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); + fprintf(screen,"Total: %.4f s.\n",(times[0]+times[1]+times[2]+times[3])/_replica_size); } fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu index 51c7fdf525..37ffb9f8d8 100644 --- a/lib/gpu/pppm_gpu_kernel.cu +++ b/lib/gpu/pppm_gpu_kernel.cu @@ -106,6 +106,7 @@ __device__ inline void atomicFloatAdd(float *address, float val) #define THREAD_ID_X get_local_id(0) #define BLOCK_ID_X get_group_id(0) #define BLOCK_SIZE_X get_local_size(0) +#define GLOBAL_SIZE_X get_global_size(0) #define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) #define __inline inline @@ -121,9 +122,16 @@ __kernel void particle_map(__global numtyp4 *x_, const int nlocal, const numtyp delyinv, const numtyp delzinv, const int nlocal_x, const int nlocal_y, const int nlocal_z, const int atom_stride, - const int max_atoms, __global int *error) { + const int max_atoms, __global int *error, + const int skip) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X; + + // Resequence the atom indices to avoid collisions during atomic ops + int nthreads=GLOBAL_SIZE_X; + ii=__mul24(ii,skip); + ii-=int(ii/nthreads)*(nthreads-1); + int nx,ny,nz; numtyp tx,ty,tz; @@ -156,32 +164,40 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_, __global numtyp *brick, __global numtyp *_rho_coeff, const int atom_stride, const int npts_x, const int npts_yx, const int nlocal_x, - const int nlocal_y, - const int nlocal_z, const numtyp b_lo_x, + const int nlocal_y, const int nlocal_z, + const int x_threads, const numtyp b_lo_x, const numtyp b_lo_y, const numtyp b_lo_z, const numtyp delxinv, const numtyp delyinv, const numtyp delzinv, const int order, const int order2, const numtyp delvolinv) { __local numtyp rho_coeff[MAX_STENCIL*MAX_STENCIL]; + int nx=THREAD_ID_X; int ny=THREAD_ID_Y; if (nxnlocal_z) + z_stop=nlocal_z; if (nx=n; k-=order) rho1d_2=rho_coeff[k]+rho1d_2*dz; numtyp y0=z0*rho1d_2; - int my=mz+ny*npts_x; + int my=mz+__mul24(ny,npts_x); for (int m=0; m=nlocal_y) y_stop-=ny-nlocal_y+1; - z_stride=npts_yx*BLOCK_1D; - z_local_stride=nlocal_x*nlocal_y*BLOCK_1D; + z_stride=__mul24(npts_yx,BLOCK_1D); + z_local_stride=__mul24(__mul24(nlocal_x,nlocal_y),BLOCK_1D); } if (tx=n; k-=order) rho1d_2=rho_coeff[k]+rho1d_2*dz; numtyp y0=z0*rho1d_2; - int my=mz+ny*npts_x; + int my=mz+__mul24(ny,npts_x); for (int m=0; m extern PairGPUDevice pair_gpu_device; @@ -146,11 +147,19 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen, d_error_flag.zero(); _max_bytes+=1; + // Used to resequence atom indices to reduce probability of collisions + // during atomic ops + _resequence_skip=ucl_device->cores(); + if (_resequence_skip%_block_size!=0) + _resequence_skip=int(_resequence_skip/_block_size+1)*_block_size; + assert(_block_size%_block_x_size==0); + std::cout << "LO: " << _nxlo_out << " " << _nylo_out << " " << _nzlo_out << " " << _nlower << std::endl; std::cout << "HI: " << _nxhi_out << " " << _nyhi_out << " " << _nzhi_out << " " << _nupper << std::endl; std::cout << "pts: " << _npts_x << " " << _npts_y << " " << _npts_z << std::endl; std::cout << "local: " << _nlocal_x << " " << _nlocal_y << " " << _nlocal_z << std::endl; + return h_brick.begin(); } @@ -185,7 +194,7 @@ void PPPMGPUMemoryT::clear() { device->clear(); } -/* + // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- @@ -221,8 +230,11 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + // Resequence atom indices to avoid collisions during atomic ops + int skip=_resequence_skip; + if (skip>GX*BX/8) + skip=_block_x_size; int _max_atoms=10; int ainum=this->ans->inum(); @@ -248,7 +260,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, &d_brick_atoms.begin(), &f_brick_x, &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, - &d_error_flag.begin()); + &d_error_flag.begin(),&skip); time_map.stop(); if (_order % 2) @@ -264,14 +276,17 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, int BY=block_y_size(); GX=static_cast(ceil(static_cast(_nlocal_x)/BX)); int GY=static_cast(ceil(static_cast(_nlocal_y)/BY)); + int x_threads=GX*BX; + int GZ=static_cast(ceil(static_cast(_nlocal_z)/8)); + GX*=GZ; d_brick.zero(); k_make_rho.set_size(GX,GY,BX,BY); k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &d_brick_counts.begin(), &d_brick_atoms.begin(), &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, &_npts_x, - &_npts_yx, &_nlocal_x, &_nlocal_y, &_nlocal_z, &f_brick_x, - &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, - &_order, &_order2, &f_delvolinv); + &_npts_yx, &_nlocal_x, &_nlocal_y, &_nlocal_z, &x_threads, + &f_brick_x, &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, + &f_delzinv, &_order, &_order2, &f_delvolinv); time_rho.stop(); time_out.start(); @@ -291,7 +306,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, return h_error_flag[0]; } -*/ + /* // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, virials,.. @@ -328,8 +343,11 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + // Resequence atom indices to avoid collisions during atomic ops + int skip=_resequence_skip; + if (skip>GX*BX/8) + skip=_block_x_size; int _max_atoms=10; int ainum=this->ans->inum(); @@ -355,7 +373,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, &d_brick_atoms.begin(), &f_brick_x, &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x, &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms, - &d_error_flag.begin()); + &d_error_flag.begin(),&skip); time_map.stop(); if (_order % 2) @@ -398,7 +416,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, return h_error_flag[0]; } */ - +/* // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- @@ -434,8 +452,11 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, // Compute the block size and grid size to keep all cores busy int BX=this->block_size(); - int GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); + // Resequence atom indices to avoid collisions during atomic ops + int skip=_resequence_skip; + if (skip>GX*BX/8) + skip=_block_x_size; int ainum=this->ans->inum(); @@ -458,13 +479,13 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, time_rho.start(); d_brick.zero(); - k_make_rho.set_size(GX,BX); k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(), &ainum, &d_brick.begin(), &d_rho_coeff.begin(), &_npts_x, &_npts_yx, &_nlocal_x, &_nlocal_y, &_nlocal_z, &f_brick_x, &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, - &_order, &_order2, &f_delvolinv, &d_error_flag.begin()); + &_order, &_order2, &f_delvolinv, &d_error_flag.begin(), + &skip); time_rho.stop(); time_out.start(); @@ -474,7 +495,7 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, return h_error_flag[0]; } - +*/ template double PPPMGPUMemoryT::host_memory_usage() const { @@ -492,7 +513,7 @@ void PPPMGPUMemoryT::compile_kernels(UCL_Device &dev) { pppm_program=new UCL_Program(dev); pppm_program->load_string(pppm_gpu_kernel,flags.c_str()); k_particle_map.set_function(*pppm_program,"particle_map"); - k_make_rho.set_function(*pppm_program,"make_rho3"); + k_make_rho.set_function(*pppm_program,"make_rho"); pos_tex.get_texture(*pppm_program,"pos_tex"); q_tex.get_texture(*pppm_program,"q_tex"); diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h index d9052472fc..8e0452ff6d 100644 --- a/lib/gpu/pppm_gpu_memory.h +++ b/lib/gpu/pppm_gpu_memory.h @@ -152,6 +152,7 @@ class PPPMGPUMemory { protected: bool _allocated, _compiled; int _block_size, _block_x_size, _block_y_size, _max_brick_atoms; + int _resequence_skip; double _max_bytes, _max_an_bytes; void compile_kernels(UCL_Device &dev); diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index 638753bd5b..33c92e9ed3 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -495,6 +495,9 @@ void PPPMGPU::init() compute_gf_denom(); compute_rho_coeff(); + if (order>8) + error->all("Cannot use order greater than 8 with pppm/gpu."); + bool success; host_brick = pppm_gpu_init(atom->nlocal, atom->nlocal+atom->nghost, screen, order, nxlo_out, nylo_out, nzlo_out, nxhi_out, @@ -689,19 +692,15 @@ double t1=MPI_Wtime(); nmax = atom->nmax; part2grid = memory->create_2d_int_array(nmax,3,"pppm:part2grid"); } -time1+=MPI_Wtime()-t1; energy = 0.0; if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0; // find grid points for all my particles // map my particle charge onto my local 3d density grid -double t2=MPI_Wtime(); particle_map(); -time2+=MPI_Wtime()-t2; -double t3=MPI_Wtime(); make_rho(); -time3+=MPI_Wtime()-t3; +time1+=MPI_Wtime()-t1; double max_error=0; int _npts_x=nxhi_out-nxlo_out+1; @@ -753,7 +752,9 @@ std::cout << "Maximum relative error: " << max_error*100.0 << "%\n"; // calculate the force on my particles +double t2=MPI_Wtime(); fieldforce(); +time2+=MPI_Wtime()-t2; // sum energy across procs and add in volume-dependent term