diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp index 08b0a792cb..8304d85199 100644 --- a/lib/gpu/pppm_gpu_memory.cpp +++ b/lib/gpu/pppm_gpu_memory.cpp @@ -45,7 +45,11 @@ int PPPMGPUMemoryT::bytes_per_atom() const { } template -bool PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen) { +bool PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen, + const int order, const int nxlo_out, + const int nylo_out, const int nzlo_out, + const int nxhi_out, const int nyhi_out, + const int nzhi_out, double **rho_coeff) { screen=_screen; if (!device->init(*ans,true,false,nlocal,nall)) @@ -68,6 +72,21 @@ bool PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen) { _allocated=true; _max_bytes=0; _max_an_bytes=ans->gpu_bytes(); + + _order=order; + _nxlo_out=nxlo_out; + _nylo_out=nylo_out; + _nzlo_out=nzlo_out; + _nxhi_out=nxhi_out; + _nyhi_out=nyhi_out; + _nzhi_out=nzhi_out; + + int n2lo=(1-order)/2; + int numel=order*( order/2 - n2lo + 1 ); + d_rho_coeff.alloc(numel,*ucl_device,UCL_READ_ONLY); + UCL_H_Vec view; + view.view(rho_coeff[0]+n2lo,numel,*ucl_device); + ucl_copy(d_rho_coeff,view,true); return true; } @@ -98,8 +117,10 @@ void PPPMGPUMemoryT::clear() { // --------------------------------------------------------------------------- template void PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, - double **host_x, int *host_type, - bool &success, double *host_q) { + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, + const double delxinv, const double delyinv, + const double delzinv) { acc_timers(); if (nlocal==0) { zero_timers(); @@ -132,6 +153,12 @@ void PPPMGPUMemoryT::compute(const int ago, const int nlocal, const int nall, int ainum=this->ans->inum(); int anall=this->atom->nall(); + numtyp f_boxlo_x=boxlo[0]; + numtyp f_boxlo_y=boxlo[1]; + numtyp f_boxlo_z=boxlo[2]; + numtyp f_delxinv=delxinv; + numtyp f_delyinv=delyinv; + numtyp f_delzinv=delzinv; // this->time_pair.start(); // this->k_pair.set_size(GX,BX); diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h index 596ed41ae4..e4f43831af 100644 --- a/lib/gpu/pppm_gpu_memory.h +++ b/lib/gpu/pppm_gpu_memory.h @@ -37,7 +37,10 @@ class PPPMGPUMemory { virtual ~PPPMGPUMemory(); /// Clear any previous data and set up for a new LAMMPS run - bool init(const int nlocal, const int nall, FILE *screen); + bool init(const int nlocal, const int nall, FILE *screen, const int order, + const int nxlo_out, const int nylo_out, const int nzlo_out, + const int nxhi_out, const int nyhi_out, const int nzhi_out, + double **rho_coeff); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ @@ -75,8 +78,9 @@ class PPPMGPUMemory { ans->zero_timers(); } - void compute(const int ago, const int nlocal, const int nall, double **host_x, - int *host_type, bool &success, double *charge); + void compute(const int ago,const int nlocal,const int nall,double **host_x, + int *host_type,bool &success,double *charge,double *boxlo, + const double delxinv,const double delyinv,const double delzinv); // -------------------------- DEVICE DATA ------------------------- @@ -98,6 +102,15 @@ class PPPMGPUMemory { PairGPUAtom *atom; + // --------------------------- GRID DATA -------------------------- + + UCL_H_Vec *h_brick; + UCL_D_Vec *d_brick; + + // -------------------------- STENCIL DATA ------------------------- + UCL_D_Vec d_rho_coeff; + int _order, _nxlo_out, _nylo_out, _nzlo_out, _nxhi_out, _nyhi_out, _nzhi_out; + // ------------------------ FORCE/ENERGY DATA ----------------------- PairGPUAns *ans; diff --git a/lib/gpu/pppm_l_gpu.cpp b/lib/gpu/pppm_l_gpu.cpp index bd7c6a51f3..99a8b12122 100644 --- a/lib/gpu/pppm_l_gpu.cpp +++ b/lib/gpu/pppm_l_gpu.cpp @@ -28,7 +28,10 @@ static PPPMGPUMemory PPPMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -bool pppm_gpu_init(const int nlocal, const int nall, FILE *screen) { +bool pppm_gpu_init(const int nlocal, const int nall, FILE *screen, + const int order, const int nxlo_out, const int nylo_out, + const int nzlo_out, const int nxhi_out, const int nyhi_out, + const int nzhi_out, double **rho_coeff) { PPPMF.clear(); int first_gpu=PPPMF.device->first_device(); int last_gpu=PPPMF.device->last_device(); @@ -48,7 +51,8 @@ bool pppm_gpu_init(const int nlocal, const int nall, FILE *screen) { } if (world_me==0) { - bool init_ok=PPPMF.init(nlocal,nall,screen); + bool init_ok=PPPMF.init(nlocal,nall,screen,order,nxlo_out,nylo_out, + nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff); if (!init_ok) return false; } @@ -67,7 +71,8 @@ bool pppm_gpu_init(const int nlocal, const int nall, FILE *screen) { fflush(screen); } if (gpu_rank==i && world_me!=0) { - bool init_ok=PPPMF.init(nlocal,nall,screen); + bool init_ok=PPPMF.init(nlocal,nall,screen,order,nxlo_out,nylo_out, + nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff); if (!init_ok) return false; } @@ -86,8 +91,10 @@ void pppm_gpu_clear() { void pppm_gpu_compute(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, - double *host_q) { - PPPMF.compute(ago,nlocal,nall,host_x,host_type,success,host_q); + double *host_q, double *boxlo, const double delxinv, + const double delyinv, const double delzinv) { + PPPMF.compute(ago,nlocal,nall,host_x,host_type,success,host_q,boxlo,delxinv, + delyinv,delzinv); } double pppm_gpu_bytes() { diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index 2ea85b164e..c93972beb9 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -37,11 +37,15 @@ // External functions from cuda library for atom decomposition -bool pppm_gpu_init(const int nlocal, const int nall, FILE *screen); +bool pppm_gpu_init(const int nlocal, const int nall, FILE *screen, + const int order, const int nxlo_out, const int nylo_out, + const int nzlo_out, const int nxhi_out, const int nyhi_out, + const int nzhi_out, double **rho_coeff); void pppm_gpu_clear(); void pppm_gpu_compute(const int ago, const int nlocal, const int nall, double **host_x, int *host_type, bool &success, - double *host_q); + double *host_q, double *boxlo, const double delxinv, + const double delyinv, const double delzinv); double pppm_gpu_bytes(); using namespace LAMMPS_NS; @@ -112,8 +116,8 @@ std::cout << "DEBUG_TIMES: " << time1 << " " << time2 << " " << time3 void PPPMGPU::init() { if (me == 0) { - if (screen) fprintf(screen,"PPPMGPU initialization ...\n"); - if (logfile) fprintf(logfile,"PPPMGPU initialization ...\n"); + if (screen) fprintf(screen,"PPPM initialization ...\n"); + if (logfile) fprintf(logfile,"PPPM initialization ...\n"); } // error check @@ -134,7 +138,7 @@ void PPPMGPU::init() if (order > MAXORDER) { char str[128]; - sprintf(str,"PPPMGPU order cannot be greater than %d",MAXORDER); + sprintf(str,"PPPM order cannot be greater than %d",MAXORDER); error->all(str); } @@ -220,7 +224,7 @@ void PPPMGPU::init() set_grid(); if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET) - error->all("PPPMGPU grid is too large"); + error->all("PPPM grid is too large"); // global indices of PPPMGPU grid range from 0 to N-1 // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of @@ -391,7 +395,7 @@ void PPPMGPU::init() order--; } - if (order == 0) error->all("PPPMGPU order has been reduced to 0"); + if (order == 0) error->all("PPPM order has been reduced to 0"); // decomposition of FFT mesh // global indices range from 0 to N-1 @@ -492,7 +496,8 @@ void PPPMGPU::init() compute_rho_coeff(); bool init_ok = pppm_gpu_init(atom->nlocal, atom->nlocal+atom->nghost, - screen); + screen, order, nxlo_out, nylo_out, nzlo_out, + nxhi_out, nyhi_out, nzhi_out, rho_coeff); if (!init_ok) error->one("Insufficient memory on accelerator (or no fix gpu).\n"); time1=0; time2=0; time3=0; @@ -659,7 +664,8 @@ void PPPMGPU::compute(int eflag, int vflag) { bool success = true; pppm_gpu_compute(neighbor->ago, atom->nlocal, atom->nlocal + atom->nghost, - atom->x, atom->type, success, atom->q); + atom->x, atom->type, success, atom->q, domain->boxlo, + delxinv, delyinv, delzinv); if (!success) error->one("Out of memory on GPGPU");