diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index 522a0a5260..08c065fa30 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -59,6 +59,18 @@ __inline double fetch_q(const int& i, const double *q)
 {
   return q[i];
 }
+
+__device__ inline void atomicFloatAdd(double* address, double val) {
+  double old = *address, assumed;
+  do {
+    assumed = old;
+    old = __longlong_as_double( atomicCAS((unsigned long long int*)address,
+                                          __double_as_longlong(assumed),
+                                          __double_as_longlong(val +
+                                                               assumed)));
+  } while (assumed != old);
+}
+
 #else
 __inline float4 fetch_pos(const int& i, const float4 *pos)
 {
@@ -68,6 +80,21 @@ __inline float fetch_q(const int& i, const float *q)
 {
   return tex1Dfetch(q_tex, i);
 }
+
+__device__ inline void atomicFloatAdd(float *address, float val)
+{
+  int i_val = __float_as_int(val);
+  int tmp0 = 0;
+  int tmp1;
+
+  while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0)
+  {
+    tmp0 = tmp1;
+    i_val = __float_as_int(val + __int_as_float(tmp1));
+  }
+}
+
+
 #endif
 
 #else
@@ -88,8 +115,8 @@ __inline float fetch_q(const int& i, const float *q)
 
 __kernel void particle_map(__global numtyp4 *x_, const int nlocal,
                            __global int *counts, __global int *ans,
-                           const numtyp boxlo_x, const numtyp boxlo_y,
-                           const numtyp boxlo_z, const numtyp delxinv,
+                           const numtyp b_lo_x, const numtyp b_lo_y,
+                           const numtyp b_lo_z, const numtyp delxinv,
                            const numtyp delyinv, const numtyp delzinv,
                            const int nlocal_x, const int nlocal_y,
                            const int nlocal_z, const int atom_stride,
@@ -102,11 +129,11 @@ __kernel void particle_map(__global numtyp4 *x_, const int nlocal,
   if (ii<nlocal) {
     numtyp4 p=fetch_pos(ii,x_);
 
-    int nx=(p.x-boxlo_x)*delxinv;
-    int ny=(p.y-boxlo_y)*delyinv;
-    int nz=(p.z-boxlo_z)*delzinv;
+    int nx=(p.x-b_lo_x)*delxinv;
+    int ny=(p.y-b_lo_y)*delyinv;
+    int nz=(p.z-b_lo_z)*delzinv;
 
     if (nx>=nlocal_x || ny>=nlocal_y || nz>=nlocal_z)
@@ -124,27 +151,14 @@ __kernel void particle_map(__global numtyp4 *x_, const int nlocal,
   }
 }
 
-__device__ inline void atomicFloatAdd(float *address, float val)
-{
-  int i_val = __float_as_int(val);
-  int tmp0 = 0;
-  int tmp1;
-
-  while( (tmp1 = atomicCAS((int *)address, tmp0, i_val)) != tmp0)
-  {
-    tmp0 = tmp1;
-    i_val = __float_as_int(val + __int_as_float(tmp1));
-  }
-}
-
 __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
                        __global int *counts, __global int *atoms,
                        __global numtyp *brick, __global numtyp *rho_coeff,
                        const int atom_stride, const int npts_x,
                        const int npts_y, const int npts_z, const int nlower,
-                       const int nupper, const numtyp boxlo_x,
-                       const numtyp boxlo_y, const numtyp boxlo_z,
+                       const int nupper, const numtyp b_lo_x,
+                       const numtyp b_lo_y, const numtyp b_lo_z,
                        const numtyp delxinv, const numtyp delyinv,
                        const numtyp delzinv, const int order,
                        const numtyp delvolinv) {
@@ -167,9 +181,9 @@ __kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
       numtyp4 p=fetch_pos(atom,x_);
       numtyp z0=delvolinv*fetch_q(atom,q_);
 
-      numtyp dx = nx - (p.x-boxlo_x)*delxinv;
-      numtyp dy = ny - (p.y-boxlo_y)*delyinv;
-      numtyp dz = nz - (p.z-boxlo_z)*delzinv;
+      numtyp dx = nx - (p.x-b_lo_x)*delxinv;
+      numtyp dy = ny - (p.y-b_lo_y)*delyinv;
+      numtyp dz = nz - (p.z-b_lo_z)*delzinv;
 
       numtyp rho1d[3][8];
       for (int k = 0; k < order; k++) {
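Side note on the double-precision branch added above: CUDA hardware of this generation has no native atomicAdd() for double, so the kernel emulates it by retrying an atomicCAS on the 64-bit pattern until no other thread has raced in between the read and the swap. Below is a minimal, self-contained sketch of that technique for readers who want to try it in isolation; the names atomic_add_double and accumulate, the launch sizes, and the test harness are made up for illustration and are not part of this patch.

// sketch_atomic_add_double.cu -- illustrative only, not part of the patch.
#include <cstdio>
#include <cuda_runtime.h>

// Same compare-and-swap idea as atomicFloatAdd(double*, double) above:
// reinterpret the double as a 64-bit integer, try to swap in old+val, and
// retry whenever another thread updated the slot in the meantime.
__device__ double atomic_add_double(double *address, double val) {
  unsigned long long int *ptr = (unsigned long long int *)address;
  unsigned long long int old = *ptr, assumed;
  do {
    assumed = old;
    old = atomicCAS(ptr, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
  } while (assumed != old);
  return __longlong_as_double(old);
}

__global__ void accumulate(double *sum, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    atomic_add_double(sum, 1.0);  // every thread contributes 1.0
}

int main() {
  double *d_sum, h_sum = 0.0;
  cudaMalloc(&d_sum, sizeof(double));
  cudaMemcpy(d_sum, &h_sum, sizeof(double), cudaMemcpyHostToDevice);
  accumulate<<<32, 128>>>(d_sum, 32 * 128);
  cudaMemcpy(&h_sum, d_sum, sizeof(double), cudaMemcpyDeviceToHost);
  printf("sum = %g (expected %d)\n", h_sum, 32 * 128);
  cudaFree(d_sum);
  return 0;
}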
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 5928bcfea8..85414820c9 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -100,7 +100,7 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   _nyhi_out=nyhi_out;
   _nzhi_out=nzhi_out;
   _max_brick_atoms=10;
-  
+
   // Get rho_coeff on device
   int n2lo=(1-order)/2;
   int numel=order*( order/2 - n2lo + 1 );
@@ -142,6 +142,11 @@ numtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   d_error_flag.zero();
   _max_bytes+=1;
 
+std::cout << "LO: " << _nxlo_out << " " << _nylo_out << " " << _nzlo_out << " " << _nlower << std::endl;
+std::cout << "HI: " << _nxhi_out << " " << _nyhi_out << " " << _nzhi_out << " " << _nupper << std::endl;
+std::cout << "pts: " << _npts_x << " " << _npts_y << " " << _npts_z << std::endl;
+std::cout << "local: " << _nlocal_x << " " << _nlocal_y << " " << _nlocal_z << std::endl;
+
   return h_brick.begin();
 }
@@ -218,43 +223,54 @@ int PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall,
   int _max_atoms=10;
   int ainum=this->ans->inum();
 
-  // Boxlo adjusted to include ghost cells and shift for even stencil order
+  // Boxlo adjusted to be upper left brick and shift for even stencil order
   double shift=0.0;
   if (_order % 2)
-    shift-=0.5/delxinv;
+    shift=0.5;
+  numtyp f_brick_x=boxlo[0]+(_nxlo_out-_nlower-shift)/delxinv;
+  numtyp f_brick_y=boxlo[1]+(_nylo_out-_nlower-shift)/delyinv;
+  numtyp f_brick_z=boxlo[2]+(_nzlo_out-_nlower-shift)/delzinv;
 
-  numtyp f_boxlo_x=boxlo[0]+shift;
-  numtyp f_boxlo_y=boxlo[1]+shift;
-  numtyp f_boxlo_z=boxlo[2]+shift;
   numtyp f_delxinv=delxinv;
   numtyp f_delyinv=delyinv;
   numtyp f_delzinv=delzinv;
   double delvolinv = delxinv*delyinv*delzinv;
   numtyp f_delvolinv = delvolinv;
 
+std::cout << "BOX: " << boxlo[0] << " " << boxlo[1] << " " << boxlo[2] << "\n";
+std::cout << "Brick: " << f_brick_x << " " << f_brick_y << " " << f_brick_z << "\n";
+std::cout << "Delx: " << 1.0/delxinv << " " << 1.0/delyinv << " " << 1.0/delzinv << std::endl;
 
   time_map.start();
   d_brick_counts.zero();
   k_particle_map.set_size(GX,BX);
   k_particle_map.run(&atom->dev_x.begin(), &ainum, &d_brick_counts.begin(),
-                     &d_brick_atoms.begin(), &f_boxlo_x, &f_boxlo_y,
-                     &f_boxlo_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x,
+                     &d_brick_atoms.begin(), &f_brick_x, &f_brick_y,
+                     &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv, &_nlocal_x,
                      &_nlocal_y, &_nlocal_z, &_atom_stride, &_max_brick_atoms,
                      &d_error_flag.begin());
   time_map.stop();
 
+//std::cout << "COUNTS: " << d_brick_counts << std::endl;
+  if (_order % 2)
+    shift=0.0;
+  else
+    shift=0.5;
+  f_brick_x=boxlo[0]+(_nxlo_out-_nlower+shift)/delxinv;
+  f_brick_y=boxlo[1]+(_nylo_out-_nlower+shift)/delyinv;
+  f_brick_z=boxlo[2]+(_nzlo_out-_nlower+shift)/delzinv;
 
   time_rho.start();
   BX=block_x_size();
   int BY=block_y_size();
-  GX=static_cast<int>(ceil(static_cast<double>(_nlocal_x))/BX);
-  int GY=static_cast<int>(ceil(static_cast<double>(_nlocal_y))/BY);
+  GX=static_cast<int>(ceil(static_cast<double>(_nlocal_x)/BX));
+  int GY=static_cast<int>(ceil(static_cast<double>(_nlocal_y)/BY));
   d_brick.zero();
   k_make_rho.set_size(GX,GY,BX,BY);
   k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(),
                  &d_brick_counts.begin(), &d_brick_atoms.begin(),
                  &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride, &_npts_x,
-                 &_npts_y, &_npts_z, &_nlower, &_nupper, &f_boxlo_x,
-                 &f_boxlo_y, &f_boxlo_z, &f_delxinv, &f_delyinv, &f_delzinv,
+                 &_npts_y, &_npts_z, &_nlower, &_nupper, &f_brick_x,
+                 &f_brick_y, &f_brick_z, &f_delxinv, &f_delyinv, &f_delzinv,
                  &_order, &f_delvolinv);
   time_rho.stop();
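Two host-side details in the pppm_gpu_memory.cpp hunk above are easy to miss. First, the value passed to the kernels is no longer the raw box origin but the lower corner of this processor's ghost-extended brick, offset by (_nxlo_out - _nlower) grid cells plus a half-cell shift that depends on whether the stencil order is odd or even. Second, the launch-grid computation now applies ceil() to the quotient rather than to the numerator, so partially filled blocks are still launched. A tiny host-only sketch of the second point, with made-up sizes (100 grid points, block size 64), is below; it is an illustration, not code from the patch.

// sketch_grid_rounding.cpp -- illustrative only, not part of the patch.
#include <cmath>
#include <cstdio>

int main() {
  const int nlocal_x = 100;  // example: grid points to cover
  const int BX = 64;         // example: threads per block

  // Old form: ceil() of an integer-valued double is a no-op, and the
  // integer cast then truncates the quotient -> one block too few.
  int old_gx = static_cast<int>(ceil(static_cast<double>(nlocal_x)) / BX);

  // Fixed form: divide first, then round up -> covers all points.
  int new_gx = static_cast<int>(ceil(static_cast<double>(nlocal_x) / BX));

  printf("old: %d block(s), fixed: %d block(s) for %d points, block size %d\n",
         old_gx, new_gx, nlocal_x, BX);  // prints 1 vs 2
  return 0;
}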
diff --git a/lib/gpu/pppm_l_gpu.cpp b/lib/gpu/pppm_l_gpu.cpp
index e75d291734..6ee0278326 100644
--- a/lib/gpu/pppm_l_gpu.cpp
+++ b/lib/gpu/pppm_l_gpu.cpp
@@ -28,11 +28,12 @@ static PPPMGPUMemory PPPMF;
 // ---------------------------------------------------------------------------
 // Allocate memory on host and device and copy constants to device
 // ---------------------------------------------------------------------------
-float * pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
-                      const int order, const int nxlo_out, const int nylo_out,
-                      const int nzlo_out, const int nxhi_out,
-                      const int nyhi_out, const int nzhi_out,
-                      double **rho_coeff, bool &success) {
+PRECISION * pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
+                          const int order, const int nxlo_out,
+                          const int nylo_out, const int nzlo_out,
+                          const int nxhi_out, const int nyhi_out,
+                          const int nzhi_out, double **rho_coeff,
+                          bool &success) {
   PPPMF.clear();
   int first_gpu=PPPMF.device->first_device();
   int last_gpu=PPPMF.device->last_device();
@@ -52,7 +53,7 @@ float * pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
   }
 
   success=true;
-  float * host_brick;
+  PRECISION * host_brick=NULL;
   if (world_me==0) {
     host_brick=PPPMF.init(nlocal,nall,screen,order,nxlo_out,nylo_out,
                           nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,
diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp
index b62065c8eb..1c2adcf106 100644
--- a/src/GPU/pppm_gpu.cpp
+++ b/src/GPU/pppm_gpu.cpp
@@ -37,7 +37,7 @@
 
 // External functions from cuda library for atom decomposition
 
-float* pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
+numtyp* pppm_gpu_init(const int nlocal, const int nall, FILE *screen,
                      const int order, const int nxlo_out, const int nylo_out,
                      const int nzlo_out, const int nxhi_out, const int nyhi_out,
                      const int nzhi_out, double **rho_coeff, bool &success);
@@ -703,17 +703,36 @@
 double t3=MPI_Wtime();
   make_rho();
 time3+=MPI_Wtime()-t3;
+double max_error=0;
 int _npts_x=nxhi_out-nxlo_out+1;
 int _npts_y=nyhi_out-nylo_out+1;
 int _npts_z=nzhi_out-nzlo_out+1;
-double *cpup = &density_brick[nlower][nlower][nlower];
-float *gpup = host_brick;
+double *cpup = &density_brick[nxlo_out][nylo_out][nzlo_out];
+numtyp *gpup = host_brick;
 int iend=_npts_x*_npts_y*_npts_z;
+int counter_x=0, counter_y=0, counter_z=0;
 for (int i=0; i<iend; i++) {
+  double error=0.0;
+  if (fabs(*cpup)>1e-8)
+    error = fabs(((*gpup)-(*cpup))/(*cpup));
+  if (error>0.05)
+//    std::cout << "* ";
+    std::cout << counter_z << " " << counter_y << " " << counter_x << " CPU GPU: " << error << " " << *cpup << " " << *gpup << std::endl;
+  if (error>max_error)
+    max_error=error;
   cpup++;
   gpup++;
+  counter_x++;
+  if (counter_x==_npts_x) {
+    counter_x=0;
+    counter_y++;
+  }
+  if (counter_y==_npts_y) {
+    counter_y=0;
+    counter_z++;
+  }
 }
+std::cout << "Maximum relative error: " << max_error*100.0 << "%\n";
 
 // all procs communicate density values from their ghost cells
 // to fully sum contribution in their 3d bricks
diff --git a/src/GPU/pppm_gpu.h b/src/GPU/pppm_gpu.h
index 997f6bcaa4..773ff31b0f 100644
--- a/src/GPU/pppm_gpu.h
+++ b/src/GPU/pppm_gpu.h
@@ -23,6 +23,8 @@ KSpaceStyle(pppm/gpu,PPPMGPU)
 #include "pppm.h"
 #include "lmptype.h"
 
+#define numtyp float
+
 namespace LAMMPS_NS {
 
 class PPPMGPU : public KSpace {
@@ -98,7 +100,7 @@ class PPPMGPU : public KSpace {
   void compute_rho1d(double, double, double);
   void compute_rho_coeff();
   void slabcorr(int);
-float *host_brick;
+numtyp *host_brick;
   double time1,time2,time3;
 };
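For reference, the counter_x/counter_y/counter_z bookkeeping in the debug comparison loop in pppm_gpu.cpp is just an unflattening of the linear brick index with x varying fastest; the same (z, y, x) triple can be recovered directly from the flat index with modulo and division. A standalone sketch with made-up brick dimensions follows; it is an illustration, not code from the patch.

// sketch_unflatten.cpp -- illustrative only, not part of the patch.
#include <cstdio>

int main() {
  const int npts_x = 4, npts_y = 3, npts_z = 2;  // example brick dimensions
  const int iend = npts_x * npts_y * npts_z;

  for (int i = 0; i < iend; i++) {
    int x = i % npts_x;             // fastest-varying index
    int y = (i / npts_x) % npts_y;
    int z = i / (npts_x * npts_y);  // slowest-varying index
    printf("i=%2d -> z=%d y=%d x=%d\n", i, z, y, x);
  }
  return 0;
}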