diff --git a/lib/gpu/pair_gpu_device.h b/lib/gpu/pair_gpu_device.h
index 3082591b73..ccb9d31c8e 100644
--- a/lib/gpu/pair_gpu_device.h
+++ b/lib/gpu/pair_gpu_device.h
@@ -74,6 +74,9 @@ class PairGPUDevice {
    * \param driver_overhead Estimated overhead from driver per timestep (s) **/
   void estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead,
                              double &gpu_driver_overhead);
+
+  /// Returns true if double precision is supported on the card
+  inline bool double_precision() { return gpu->double_precision(); }
 
   /// Output a message with timing information
   void output_times(UCL_Timer &time_pair, PairGPUAns &ans,
diff --git a/lib/gpu/pppm_gpu_kernel.cu b/lib/gpu/pppm_gpu_kernel.cu
index 4a692ed38a..b82e95fbd6 100644
--- a/lib/gpu/pppm_gpu_kernel.cu
+++ b/lib/gpu/pppm_gpu_kernel.cu
@@ -148,8 +148,7 @@ __kernel void particle_map(__global numtyp4 *x_, __global numtyp *q_,
 
 /* --------------------------- */
 
-__kernel void make_rho(__global numtyp4 *x_, __global numtyp *q_,
-                       __global int *counts, __global grdtyp4 *atoms,
+__kernel void make_rho(__global int *counts, __global grdtyp4 *atoms,
                        __global grdtyp *brick, __global grdtyp *_rho_coeff,
                        const int atom_stride, const int npts_x,
                        const int npts_y, const int npts_z, const int nlocal_x,
diff --git a/lib/gpu/pppm_gpu_memory.cpp b/lib/gpu/pppm_gpu_memory.cpp
index 3d622d9ad7..6f1bbf0d2c 100644
--- a/lib/gpu/pppm_gpu_memory.cpp
+++ b/lib/gpu/pppm_gpu_memory.cpp
@@ -64,14 +64,20 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
                               const int nylo_out, const int nzlo_out,
                               const int nxhi_out, const int nyhi_out,
                               const int nzhi_out, double **rho_coeff,
-                              grdtyp **vd_brick, bool &success) {
+                              grdtyp **vd_brick, int &flag) {
   _max_bytes=10;
   screen=_screen;
+  bool success=true;
 
   if (!device->init(*ans,nlocal,nall)) {
-    success=false;
+    flag=-2;
     return 0;
   }
+
+  if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
+    flag=-5;
+    return 0;
+  }
+
   ucl_device=device->gpu;
   atom=&device->atom;
 
@@ -157,6 +163,11 @@ grdtyp * PPPMGPUMemoryT::init(const int nlocal, const int nall, FILE *_screen,
   success=success && (h_error_flag.alloc(1,*ucl_device)==UCL_SUCCESS);
   success=success && (d_error_flag.alloc(1,*ucl_device,UCL_WRITE_ONLY)==
                       UCL_SUCCESS);
+  if (!success) {
+    flag=-3;
+    return 0;
+  }
+
   d_error_flag.zero();
   _max_bytes+=1;
 
@@ -269,8 +280,7 @@ int PPPMGPUMemoryT::spread(const int ago, const int nlocal, const int nall,
   BX=block_size();
   GX=static_cast<int>(ceil(static_cast<double>(_npts_y*_npts_z)/BLOCK_PENCILS));
   k_make_rho.set_size(GX,BX);
-  k_make_rho.run(&atom->dev_x.begin(), &atom->dev_q.begin(),
-                 &d_brick_counts.begin(), &d_brick_atoms.begin(),
+  k_make_rho.run(&d_brick_counts.begin(), &d_brick_atoms.begin(),
                  &d_brick.begin(), &d_rho_coeff.begin(), &_atom_stride,
                  &_npts_x, &_npts_y, &_npts_z, &_nlocal_x, &_nlocal_y,
                  &_nlocal_z, &_order_m_1, &_order, &_order2);
diff --git a/lib/gpu/pppm_gpu_memory.h b/lib/gpu/pppm_gpu_memory.h
index d3928d29c2..f996cdb1a3 100644
--- a/lib/gpu/pppm_gpu_memory.h
+++ b/lib/gpu/pppm_gpu_memory.h
@@ -35,10 +35,17 @@ class PPPMGPUMemory {
   virtual ~PPPMGPUMemory();
 
   /// Clear any previous data and set up for a new LAMMPS run
+  /** Success will be:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -2 if the GPU could not be found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for this GPU
+    * - -5 if double precision is not supported on the card **/
   grdtyp * init(const int nlocal, const int nall, FILE *screen,
                 const int order, const int nxlo_out, const int nylo_out,
const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, - double **rho_coeff, grdtyp **vd_brick, bool &success); + double **rho_coeff, grdtyp **vd_brick, int &success); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ diff --git a/lib/gpu/pppm_l_gpu.cpp b/lib/gpu/pppm_l_gpu.cpp index c36f8eaf0a..6f207113f9 100644 --- a/lib/gpu/pppm_l_gpu.cpp +++ b/lib/gpu/pppm_l_gpu.cpp @@ -35,7 +35,7 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, - grdtyp **vd_brick, bool &success) { + grdtyp **vd_brick, int &success) { pppm.clear(0.0); int first_gpu=pppm.device->first_device(); int last_gpu=pppm.device->last_device(); @@ -54,13 +54,11 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, fflush(screen); } - success=true; + success=0; grdtyp * host_brick=NULL; if (world_me==0) { host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out,nzlo_out, nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick,success); - if (!success) - return host_brick; } pppm.device->world_barrier(); @@ -80,8 +78,6 @@ grdtyp * pppm_gpu_init(memtyp &pppm, const int nlocal, const int nall, host_brick=pppm.init(nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff, vd_brick, success); - if (!success) - return host_brick; } pppm.device->gpu_barrier(); if (message) @@ -97,7 +93,7 @@ float * pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, - float **vd_brick, bool &success) { + float **vd_brick, int &success) { return pppm_gpu_init(PPPMF,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, success); @@ -128,7 +124,7 @@ double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, const int nylo_out, const int nzlo_out, const int nxhi_out, const int nyhi_out, const int nzhi_out, double **rho_coeff, - double **vd_brick, bool &success) { + double **vd_brick, int &success) { return pppm_gpu_init(PPPMD,nlocal,nall,screen,order,nxlo_out,nylo_out, nzlo_out,nxhi_out,nyhi_out,nzhi_out,rho_coeff,vd_brick, success); diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index eca648d3f3..9541f82782 100644 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -16,7 +16,11 @@ if (test $1 = 1) then if (test -e ../pppm.cpp) then cp pppm_gpu.cpp .. + cp pppm_gpu_single.cpp .. + cp pppm_gpu_double.cpp .. cp pppm_gpu.h .. + cp pppm_gpu_single.h .. + cp pppm_gpu_double.h .. 
 fi
 
 if (test -e ../pair_gayberne.cpp) then
@@ -75,6 +79,8 @@ elif (test $1 = 0) then
   fi
 
   rm ../pppm_gpu.cpp
+  rm ../pppm_gpu_single.cpp
+  rm ../pppm_gpu_double.cpp
   rm ../pair_gayberne_gpu.cpp
   rm ../pair_lj_cut_gpu.cpp
   rm ../pair_morse_gpu.cpp
@@ -92,6 +98,8 @@ elif (test $1 = 0) then
   rm ../pair_omp_gpu.cpp
 
   rm ../pppm_gpu.h
+  rm ../pppm_gpu_single.h
+  rm ../pppm_gpu_double.h
   rm ../pair_gayberne_gpu.h
   rm ../pair_lj_cut_gpu.h
   rm ../pair_morse_gpu.h
diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp
index e6f210b5d0..35edcdedbf 100644
--- a/src/GPU/pppm_gpu.cpp
+++ b/src/GPU/pppm_gpu.cpp
@@ -101,7 +101,7 @@ PPPMGPUT::~PPPMGPU()
 ------------------------------------------------------------------------- */
 
 template <class grdtyp>
-void PPPMGPUT::init()
+void PPPMGPUT::base_init()
 {
   if (me == 0) {
     if (screen) fprintf(screen,"PPPM initialization ...\n");
@@ -111,8 +111,8 @@ void PPPMGPUT::init()
   // error check
 
   if (domain->triclinic)
-    error->all("Cannot (yet) use PPPMGPU with triclinic box");
-  if (domain->dimension == 2) error->all("Cannot use PPPMGPU with 2d simulation");
+    error->all("Cannot (yet) use PPPM with triclinic box");
+  if (domain->dimension == 2) error->all("Cannot use PPPM with 2d simulation");
 
   if (!atom->q_flag) error->all("Kspace style requires atom attribute q");
diff --git a/src/GPU/pppm_gpu.h b/src/GPU/pppm_gpu.h
index 8eeb269521..ba0b3e3e93 100644
--- a/src/GPU/pppm_gpu.h
+++ b/src/GPU/pppm_gpu.h
@@ -101,4 +101,3 @@ class PPPMGPU : public KSpace {
 }
 
 #endif
-#endif
diff --git a/src/GPU/pppm_gpu_single.cpp b/src/GPU/pppm_gpu_single.cpp
index 9030d76414..693c47f437 100644
--- a/src/GPU/pppm_gpu_single.cpp
+++ b/src/GPU/pppm_gpu_single.cpp
@@ -35,20 +35,22 @@
 #include "memory.h"
 #include "error.h"
 
+#define grdtyp float
+
 // External functions from cuda library for atom decomposition
 
-numtyp* pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
-                        const int order, const int nxlo_out, const int nylo_out,
-                        const int nzlo_out, const int nxhi_out,
-                        const int nyhi_out, const int nzhi_out,
-                        double **rho_coeff, numtyp **_vd_brick,
-                        bool &success);
+grdtyp* pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen,
+                        const int order, const int nxlo_out,
+                        const int nylo_out, const int nzlo_out,
+                        const int nxhi_out, const int nyhi_out,
+                        const int nzhi_out, double **rho_coeff,
+                        grdtyp **_vd_brick, int &success);
 void pppm_gpu_clear_f(const double poisson_time);
 int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall,
                       double **host_x, int *host_type, bool &success,
                       double *host_q, double *boxlo, const double delxinv,
                       const double delyinv, const double delzinv);
-void pppm_gpu_interp_f(const numtyp qqrd2e_scale);
+void pppm_gpu_interp_f(const grdtyp qqrd2e_scale);
 double pppm_gpu_bytes_f();
 
 using namespace LAMMPS_NS;
@@ -64,446 +66,54 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-PPPMGPU::PPPMGPU(LAMMPS *lmp, int narg, char **arg) : KSpace(lmp, narg, arg)
+PPPMGPUSingle::PPPMGPUSingle(LAMMPS *lmp, int narg, char **arg) :
+  PPPMGPU(lmp, narg, arg)
 {
-  if (narg != 1) error->all("Illegal kspace_style pppm command");
-
-  precision = atof(arg[0]);
-  PI = 4.0*atan(1.0);
-
-  nfactors = 3;
-  factors = new int[nfactors];
-  factors[0] = 2;
-  factors[1] = 3;
-  factors[2] = 5;
-
-  MPI_Comm_rank(world,&me);
-  MPI_Comm_size(world,&nprocs);
-
-  density_brick = NULL;
-  vd_brick = NULL;
-  density_fft = NULL;
-  greensfn = NULL;
-  work1 = work2 = NULL;
-  vg = NULL;
-  fkx = fky = fkz = NULL;
-  buf1 = buf2 = NULL;
-
-  gf_b =
NULL; - rho1d = rho_coeff = NULL; - - fft1 = fft2 = NULL; - remap = NULL; - - nmax = 0; } /* ---------------------------------------------------------------------- free all memory ------------------------------------------------------------------------- */ -PPPMGPU::~PPPMGPU() +PPPMGPUSingle::~PPPMGPUSingle() { - delete [] factors; - deallocate(); - pppm_gpu_clear(poisson_time); + pppm_gpu_clear_f(poisson_time); } /* ---------------------------------------------------------------------- called once before run ------------------------------------------------------------------------- */ -void PPPMGPU::init() +void PPPMGPUSingle::init() { - if (me == 0) { - if (screen) fprintf(screen,"PPPM initialization ...\n"); - if (logfile) fprintf(logfile,"PPPM initialization ...\n"); - } - - // error check - - if (domain->triclinic) - error->all("Cannot (yet) use PPPMGPU with triclinic box"); - if (domain->dimension == 2) error->all("Cannot use PPPMGPU with 2d simulation"); - - if (!atom->q_flag) error->all("Kspace style requires atom attribute q"); - - if (slabflag == 0 && domain->nonperiodic > 0) - error->all("Cannot use nonperiodic boundaries with PPPMGPU"); - if (slabflag == 1) { - if (domain->xperiodic != 1 || domain->yperiodic != 1 || - domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) - error->all("Incorrect boundaries with slab PPPMGPU"); - } - - if (order > MAXORDER) { - char str[128]; - sprintf(str,"PPPM order cannot be greater than %d",MAXORDER); - error->all(str); - } - - // free all arrays previously allocated - - deallocate(); - - // extract short-range Coulombic cutoff from pair style - - qqrd2e = force->qqrd2e; - scale = 1.0; - - if (force->pair == NULL) - error->all("KSpace style is incompatible with Pair style"); - int itmp; - double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); - if (p_cutoff == NULL) - error->all("KSpace style is incompatible with Pair style"); - cutoff = *p_cutoff; - - // if kspace is TIP4P, extract TIP4P params from pair style - - qdist = 0.0; - - if (strcmp(force->kspace_style,"pppm/tip4p") == 0) { - if (force->pair == NULL) - error->all("KSpace style is incompatible with Pair style"); - double *p_qdist = (double *) force->pair->extract("qdist",itmp); - int *p_typeO = (int *) force->pair->extract("typeO",itmp); - int *p_typeH = (int *) force->pair->extract("typeH",itmp); - int *p_typeA = (int *) force->pair->extract("typeA",itmp); - int *p_typeB = (int *) force->pair->extract("typeB",itmp); - if (!p_qdist || !p_typeO || !p_typeH || !p_typeA || !p_typeB) - error->all("KSpace style is incompatible with Pair style"); - qdist = *p_qdist; - typeO = *p_typeO; - typeH = *p_typeH; - int typeA = *p_typeA; - int typeB = *p_typeB; - - if (force->angle == NULL || force->bond == NULL) - error->all("Bond and angle potentials must be defined for TIP4P"); - double theta = force->angle->equilibrium_angle(typeA); - double blen = force->bond->equilibrium_distance(typeB); - alpha = qdist / (2.0 * cos(0.5*theta) * blen); - } - - // compute qsum & qsqsum and warn if not charge-neutral - - qsum = qsqsum = 0.0; - for (int i = 0; i < atom->nlocal; i++) { - qsum += atom->q[i]; - qsqsum += atom->q[i]*atom->q[i]; - } - - double tmp; - MPI_Allreduce(&qsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world); - qsum = tmp; - MPI_Allreduce(&qsqsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world); - qsqsum = tmp; - - if (qsqsum == 0.0) - error->all("Cannot use kspace solver on system with no charge"); - if (fabs(qsum) > SMALL && me == 0) { - char str[128]; - sprintf(str,"System is not charge neutral, net 
charge = %g",qsum); - error->warning(str); - } - - // setup FFT grid resolution and g_ewald - // normally one iteration thru while loop is all that is required - // if grid stencil extends beyond neighbor proc, reduce order and try again - - int iteration = 0; - - while (order > 0) { - - if (iteration && me == 0) - error->warning("Reducing PPPMGPU order b/c stencil extends " - "beyond neighbor processor"); - iteration++; - - set_grid(); - - if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET) - error->all("PPPM grid is too large"); - - // global indices of PPPMGPU grid range from 0 to N-1 - // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of - // global PPPMGPU grid that I own without ghost cells - // for slab PPPMGPU, assign z grid as if it were not extended - - nxlo_in = comm->myloc[0]*nx_pppm / comm->procgrid[0]; - nxhi_in = (comm->myloc[0]+1)*nx_pppm / comm->procgrid[0] - 1; - nylo_in = comm->myloc[1]*ny_pppm / comm->procgrid[1]; - nyhi_in = (comm->myloc[1]+1)*ny_pppm / comm->procgrid[1] - 1; - nzlo_in = comm->myloc[2] * - (static_cast (nz_pppm/slab_volfactor)) / comm->procgrid[2]; - nzhi_in = (comm->myloc[2]+1) * - (static_cast (nz_pppm/slab_volfactor)) / comm->procgrid[2] - 1; - - // nlower,nupper = stencil size for mapping particles to PPPMGPU grid - - nlower = -(order-1)/2; - nupper = order/2; - - // shift values for particle <-> grid mapping - // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 - - if (order % 2) shift = OFFSET + 0.5; - else shift = OFFSET; - if (order % 2) shiftone = 0.0; - else shiftone = 0.5; - - // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of - // global PPPMGPU grid that my particles can contribute charge to - // effectively nlo_in,nhi_in + ghost cells - // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest - // position a particle in my box can be at - // dist[3] = particle position bound = subbox + skin/2.0 + qdist - // qdist = offset due to TIP4P fictitious charge - // convert to triclinic if necessary - // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping - // for slab PPPMGPU, assign z grid as if it were not extended - - triclinic = domain->triclinic; - double *prd,*sublo,*subhi; - - if (triclinic == 0) { - prd = domain->prd; - boxlo = domain->boxlo; - sublo = domain->sublo; - subhi = domain->subhi; - } else { - prd = domain->prd_lamda; - boxlo = domain->boxlo_lamda; - sublo = domain->sublo_lamda; - subhi = domain->subhi_lamda; - } - - double xprd = prd[0]; - double yprd = prd[1]; - double zprd = prd[2]; - double zprd_slab = zprd*slab_volfactor; - - double dist[3]; - double cuthalf = 0.5*neighbor->skin + qdist; - if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf; - else { - dist[0] = cuthalf/domain->prd[0]; - dist[1] = cuthalf/domain->prd[1]; - dist[2] = cuthalf/domain->prd[2]; - } - - int nlo,nhi; - - nlo = static_cast ((sublo[0]-dist[0]-boxlo[0]) * - nx_pppm/xprd + shift) - OFFSET; - nhi = static_cast ((subhi[0]+dist[0]-boxlo[0]) * - nx_pppm/xprd + shift) - OFFSET; - nxlo_out = nlo + nlower; - nxhi_out = nhi + nupper; - - nlo = static_cast ((sublo[1]-dist[1]-boxlo[1]) * - ny_pppm/yprd + shift) - OFFSET; - nhi = static_cast ((subhi[1]+dist[1]-boxlo[1]) * - ny_pppm/yprd + shift) - OFFSET; - nylo_out = nlo + nlower; - nyhi_out = nhi + nupper; - - nlo = static_cast ((sublo[2]-dist[2]-boxlo[2]) * - nz_pppm/zprd_slab + shift) - OFFSET; - nhi = static_cast ((subhi[2]+dist[2]-boxlo[2]) * - nz_pppm/zprd_slab + shift) - OFFSET; - nzlo_out = nlo + nlower; - nzhi_out = nhi 
+ nupper; - - // for slab PPPMGPU, change the grid boundary for processors at +z end - // to include the empty volume between periodically repeating slabs - // for slab PPPMGPU, want charge data communicated from -z proc to +z proc, - // but not vice versa, also want field data communicated from +z proc to - // -z proc, but not vice versa - // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells) - - if (slabflag && ((comm->myloc[2]+1) == (comm->procgrid[2]))) { - nzhi_in = nz_pppm - 1; - nzhi_out = nz_pppm - 1; - } - - // nlo_ghost,nhi_ghost = # of planes I will recv from 6 directions - // that overlay domain I own - // proc in that direction tells me via sendrecv() - // if no neighbor proc, value is from self since I have ghosts regardless - - int nplanes; - MPI_Status status; - - nplanes = nxlo_in - nxlo_out; - if (comm->procneigh[0][0] != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][0],0, - &nxhi_ghost,1,MPI_INT,comm->procneigh[0][1],0, - world,&status); - else nxhi_ghost = nplanes; - - nplanes = nxhi_out - nxhi_in; - if (comm->procneigh[0][1] != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][1],0, - &nxlo_ghost,1,MPI_INT,comm->procneigh[0][0], - 0,world,&status); - else nxlo_ghost = nplanes; - - nplanes = nylo_in - nylo_out; - if (comm->procneigh[1][0] != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][0],0, - &nyhi_ghost,1,MPI_INT,comm->procneigh[1][1],0, - world,&status); - else nyhi_ghost = nplanes; - - nplanes = nyhi_out - nyhi_in; - if (comm->procneigh[1][1] != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][1],0, - &nylo_ghost,1,MPI_INT,comm->procneigh[1][0],0, - world,&status); - else nylo_ghost = nplanes; - - nplanes = nzlo_in - nzlo_out; - if (comm->procneigh[2][0] != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][0],0, - &nzhi_ghost,1,MPI_INT,comm->procneigh[2][1],0, - world,&status); - else nzhi_ghost = nplanes; - - nplanes = nzhi_out - nzhi_in; - if (comm->procneigh[2][1] != me) - MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][1],0, - &nzlo_ghost,1,MPI_INT,comm->procneigh[2][0],0, - world,&status); - else nzlo_ghost = nplanes; - - // test that ghost overlap is not bigger than my sub-domain - - int flag = 0; - if (nxlo_ghost > nxhi_in-nxlo_in+1) flag = 1; - if (nxhi_ghost > nxhi_in-nxlo_in+1) flag = 1; - if (nylo_ghost > nyhi_in-nylo_in+1) flag = 1; - if (nyhi_ghost > nyhi_in-nylo_in+1) flag = 1; - if (nzlo_ghost > nzhi_in-nzlo_in+1) flag = 1; - if (nzhi_ghost > nzhi_in-nzlo_in+1) flag = 1; - - int flag_all; - MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world); - - if (flag_all == 0) break; - order--; - } - - if (order == 0) error->all("PPPM order has been reduced to 0"); - - // decomposition of FFT mesh - // global indices range from 0 to N-1 - // proc owns entire x-dimension, clump of columns in y,z dimensions - // npey_fft,npez_fft = # of procs in y,z dims - // if nprocs is small enough, proc can own 1 or more entire xy planes, - // else proc owns 2d sub-blocks of yz plane - // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions - // nlo_fft,nhi_fft = lower/upper limit of the section - // of the global FFT mesh that I own - - int npey_fft,npez_fft; - if (nz_pppm >= nprocs) { - npey_fft = 1; - npez_fft = nprocs; - } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); - - int me_y = me % npey_fft; - int me_z = me / npey_fft; - - nxlo_fft = 0; - nxhi_fft = nx_pppm - 1; - nylo_fft = me_y*ny_pppm/npey_fft; - nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1; - nzlo_fft = me_z*nz_pppm/npez_fft; 
- nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1; - - // PPPMGPU grid for this proc, including ghosts - - ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * - (nzhi_out-nzlo_out+1); - - // FFT arrays on this proc, without ghosts - // nfft = FFT points in FFT decomposition on this proc - // nfft_brick = FFT points in 3d brick-decomposition on this proc - // nfft_both = greater of 2 values - - nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) * - (nzhi_fft-nzlo_fft+1); - int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) * - (nzhi_in-nzlo_in+1); - nfft_both = MAX(nfft,nfft_brick); - - // buffer space for use in brick2fft and fillbrick - // idel = max # of ghost planes to send or recv in +/- dir of each dim - // nx,ny,nz = owned planes (including ghosts) in each dim - // nxx,nyy,nzz = max # of grid cells to send in each dim - // nbuf = max in any dim, augment by 3x for components of vd_xyz in fillbrick - - int idelx,idely,idelz,nx,ny,nz,nxx,nyy,nzz; - - idelx = MAX(nxlo_ghost,nxhi_ghost); - idelx = MAX(idelx,nxhi_out-nxhi_in); - idelx = MAX(idelx,nxlo_in-nxlo_out); - - idely = MAX(nylo_ghost,nyhi_ghost); - idely = MAX(idely,nyhi_out-nyhi_in); - idely = MAX(idely,nylo_in-nylo_out); - - idelz = MAX(nzlo_ghost,nzhi_ghost); - idelz = MAX(idelz,nzhi_out-nzhi_in); - idelz = MAX(idelz,nzlo_in-nzlo_out); - - nx = nxhi_out - nxlo_out + 1; - ny = nyhi_out - nylo_out + 1; - nz = nzhi_out - nzlo_out + 1; - - nxx = idelx * ny * nz; - nyy = idely * nx * nz; - nzz = idelz * nx * ny; - - nbuf = MAX(nxx,nyy); - nbuf = MAX(nbuf,nzz); - nbuf *= 3; - - // print stats - - int ngrid_max,nfft_both_max,nbuf_max; - MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world); - MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world); - MPI_Allreduce(&nbuf,&nbuf_max,1,MPI_INT,MPI_MAX,world); - - if (me == 0) { - if (screen) fprintf(screen," brick FFT buffer size/proc = %d %d %d\n", - ngrid_max,nfft_both_max,nbuf_max); - if (logfile) fprintf(logfile," brick FFT buffer size/proc = %d %d %d\n", - ngrid_max,nfft_both_max,nbuf_max); - } - - // allocate K-space dependent memory - - allocate(); - - // pre-compute Green's function denomiator expansion - // pre-compute 1d charge distribution coefficients - - compute_gf_denom(); - compute_rho_coeff(); + base_init(); if (order>8) error->all("Cannot use order greater than 8 with pppm/gpu."); - pppm_gpu_clear(poisson_time); + pppm_gpu_clear_f(poisson_time); - bool success; - numtyp *data, *h_brick; - h_brick = pppm_gpu_init(atom->nlocal, atom->nlocal+atom->nghost, screen, - order, nxlo_out, nylo_out, nzlo_out, nxhi_out, - nyhi_out, nzhi_out, rho_coeff, &data, success); + int success; + grdtyp *data, *h_brick; + h_brick = pppm_gpu_init_f(atom->nlocal, atom->nlocal+atom->nghost, screen, + order, nxlo_out, nylo_out, nzlo_out, nxhi_out, + nyhi_out, nzhi_out, rho_coeff, &data, success); + + int all_success; + MPI_Allreduce(&success, &all_success, 1, MPI_INT, MPI_MIN, world); + if (all_success != 0) { + if (all_success == -1) + error->all("Could not find fix gpu"); + else if (all_success == -2) + error->all("At least one node could not find specified GPU."); + else if (all_success == -3) + error->all("Out of memory on GPU."); + else if (all_success == -4) + error->all("GPU library not compiled for this GPU."); + else if (all_success == -5) + error->all("Double precision is not supported on this GPU."); + else + error->all("Unknown error in GPU library."); + } density_brick = create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, @@ -512,175 +122,20 @@ void PPPMGPU::init() 
create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, nxlo_out,nxhi_out,"pppm:vd_brick",data,4); - if (!success) - error->one("Insufficient memory on accelerator (or no fix gpu).\n"); - poisson_time=0; } -/* ---------------------------------------------------------------------- - adjust PPPMGPU coeffs, called initially and whenever volume has changed -------------------------------------------------------------------------- */ - -void PPPMGPU::setup() -{ - int i,j,k,l,m,n; - double *prd; - - // volume-dependent factors - // adjust z dimension for 2d slab PPPMGPU - // z dimension for 3d PPPMGPU is zprd since slab_volfactor = 1.0 - - if (triclinic == 0) prd = domain->prd; - else prd = domain->prd_lamda; - - double xprd = prd[0]; - double yprd = prd[1]; - double zprd = prd[2]; - double zprd_slab = zprd*slab_volfactor; - volume = xprd * yprd * zprd_slab; - - delxinv = nx_pppm/xprd; - delyinv = ny_pppm/yprd; - delzinv = nz_pppm/zprd_slab; - - delvolinv = delxinv*delyinv*delzinv; - - double unitkx = (2.0*PI/xprd); - double unitky = (2.0*PI/yprd); - double unitkz = (2.0*PI/zprd_slab); - - // fkx,fky,fkz for my FFT grid pts - - double per; - - for (i = nxlo_fft; i <= nxhi_fft; i++) { - per = i - nx_pppm*(2*i/nx_pppm); - fkx[i] = unitkx*per; - } - - for (i = nylo_fft; i <= nyhi_fft; i++) { - per = i - ny_pppm*(2*i/ny_pppm); - fky[i] = unitky*per; - } - - for (i = nzlo_fft; i <= nzhi_fft; i++) { - per = i - nz_pppm*(2*i/nz_pppm); - fkz[i] = unitkz*per; - } - - // virial coefficients - - double sqk,vterm; - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) { - for (j = nylo_fft; j <= nyhi_fft; j++) { - for (i = nxlo_fft; i <= nxhi_fft; i++) { - sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; - if (sqk == 0.0) { - vg[n][0] = 0.0; - vg[n][1] = 0.0; - vg[n][2] = 0.0; - vg[n][3] = 0.0; - vg[n][4] = 0.0; - vg[n][5] = 0.0; - } else { - vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); - vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; - vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; - vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; - vg[n][3] = vterm*fkx[i]*fky[j]; - vg[n][4] = vterm*fkx[i]*fkz[k]; - vg[n][5] = vterm*fky[j]*fkz[k]; - } - n++; - } - } - } - - // modified (Hockney-Eastwood) Coulomb Green's function - - int nx,ny,nz,kper,lper,mper; - double snx,sny,snz,snx2,sny2,snz2; - double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; - double sum1,dot1,dot2; - double numerator,denominator; - - int nbx = static_cast ((g_ewald*xprd/(PI*nx_pppm)) * - pow(-log(EPS_HOC),0.25)); - int nby = static_cast ((g_ewald*yprd/(PI*ny_pppm)) * - pow(-log(EPS_HOC),0.25)); - int nbz = static_cast ((g_ewald*zprd_slab/(PI*nz_pppm)) * - pow(-log(EPS_HOC),0.25)); - - double form = 1.0; - - n = 0; - for (m = nzlo_fft; m <= nzhi_fft; m++) { - mper = m - nz_pppm*(2*m/nz_pppm); - snz = sin(0.5*unitkz*mper*zprd_slab/nz_pppm); - snz2 = snz*snz; - - for (l = nylo_fft; l <= nyhi_fft; l++) { - lper = l - ny_pppm*(2*l/ny_pppm); - sny = sin(0.5*unitky*lper*yprd/ny_pppm); - sny2 = sny*sny; - - for (k = nxlo_fft; k <= nxhi_fft; k++) { - kper = k - nx_pppm*(2*k/nx_pppm); - snx = sin(0.5*unitkx*kper*xprd/nx_pppm); - snx2 = snx*snx; - - sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + - pow(unitkz*mper,2.0); - - if (sqk != 0.0) { - numerator = form*12.5663706/sqk; - denominator = gf_denom(snx2,sny2,snz2); - sum1 = 0.0; - for (nx = -nbx; nx <= nbx; nx++) { - qx = unitkx*(kper+nx_pppm*nx); - sx = exp(-.25*pow(qx/g_ewald,2.0)); - wx = 1.0; - argx = 0.5*qx*xprd/nx_pppm; - if (argx != 0.0) wx = pow(sin(argx)/argx,order); - for (ny = -nby; ny <= nby; ny++) { - qy = 
unitky*(lper+ny_pppm*ny); - sy = exp(-.25*pow(qy/g_ewald,2.0)); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm; - if (argy != 0.0) wy = pow(sin(argy)/argy,order); - for (nz = -nbz; nz <= nbz; nz++) { - qz = unitkz*(mper+nz_pppm*nz); - sz = exp(-.25*pow(qz/g_ewald,2.0)); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm; - if (argz != 0.0) wz = pow(sin(argz)/argz,order); - - dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; - dot2 = qx*qx+qy*qy+qz*qz; - sum1 += (dot1/dot2) * sx*sy*sz * pow(wx*wy*wz,2.0); - } - } - } - greensfn[n++] = numerator*sum1/denominator; - } else greensfn[n++] = 0.0; - } - } - } -} - /* ---------------------------------------------------------------------- compute the PPPMGPU long-range force, energy, virial ------------------------------------------------------------------------- */ -void PPPMGPU::compute(int eflag, int vflag) +void PPPMGPUSingle::compute(int eflag, int vflag) { bool success = true; - int flag=pppm_gpu_spread(neighbor->ago, atom->nlocal, atom->nlocal + - atom->nghost, atom->x, atom->type, success, atom->q, - domain->boxlo, delxinv, delyinv, delzinv); + int flag=pppm_gpu_spread_f(neighbor->ago, atom->nlocal, atom->nlocal + + atom->nghost, atom->x, atom->type, success, + atom->q, domain->boxlo, delxinv, delyinv, + delzinv); if (!success) error->one("Out of memory on GPGPU"); if (flag != 0) @@ -722,8 +177,8 @@ void PPPMGPU::compute(int eflag, int vflag) // calculate the force on my particles - numtyp qqrd2e_scale=qqrd2e*scale; - pppm_gpu_interp(qqrd2e_scale); + grdtyp qqrd2e_scale=qqrd2e*scale; + pppm_gpu_interp_f(qqrd2e_scale); // sum energy across procs and add in volume-dependent term @@ -755,1094 +210,18 @@ void PPPMGPU::compute(int eflag, int vflag) if (triclinic) domain->lamda2x(atom->nlocal); } -/* ---------------------------------------------------------------------- - allocate memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPMGPU::allocate() -{ - density_fft = - (double *) memory->smalloc(nfft_both*sizeof(double),"pppm:density_fft"); - greensfn = - (double *) memory->smalloc(nfft_both*sizeof(double),"pppm:greensfn"); - work1 = (double *) memory->smalloc(2*nfft_both*sizeof(double),"pppm:work1"); - work2 = (double *) memory->smalloc(2*nfft_both*sizeof(double),"pppm:work2"); - vg = memory->create_2d_double_array(nfft_both,6,"pppm:vg"); - - fkx = memory->create_1d_double_array(nxlo_fft,nxhi_fft,"pppm:fkx"); - fky = memory->create_1d_double_array(nylo_fft,nyhi_fft,"pppm:fky"); - fkz = memory->create_1d_double_array(nzlo_fft,nzhi_fft,"pppm:fkz"); - - buf1 = (double *) memory->smalloc(nbuf*sizeof(double),"pppm:buf1"); - buf2 = (double *) memory->smalloc(nbuf*sizeof(double),"pppm:buf2"); - - // summation coeffs - - gf_b = new double[order]; - rho1d = memory->create_2d_double_array(3,-order/2,order/2,"pppm:rho1d"); - rho_coeff = memory->create_2d_double_array(order,(1-order)/2,order/2, - "pppm:rho_coeff"); - - // create 2 FFTs and a Remap - // 1st FFT keeps data in FFT decompostion - // 2nd FFT returns data in 3d brick decomposition - // remap takes data from 3d brick to FFT decomposition - - int tmp; - - fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 0,0,&tmp); - - fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - 0,0,&tmp); - - remap = 
new Remap(lmp,world, - nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, - nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, - 1,0,0,2); -} - -/* ---------------------------------------------------------------------- - deallocate memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPMGPU::deallocate() -{ - destroy_3d_offset(density_brick,nzlo_out,nylo_out); - destroy_3d_offset(vd_brick,nzlo_out,nylo_out); - - memory->sfree(density_fft); - memory->sfree(greensfn); - memory->sfree(work1); - memory->sfree(work2); - memory->destroy_2d_double_array(vg); - - memory->destroy_1d_double_array(fkx,nxlo_fft); - memory->destroy_1d_double_array(fky,nylo_fft); - memory->destroy_1d_double_array(fkz,nzlo_fft); - - memory->sfree(buf1); - memory->sfree(buf2); - - delete [] gf_b; - memory->destroy_2d_double_array(rho1d,-order/2); - memory->destroy_2d_double_array(rho_coeff,(1-order)/2); - - delete fft1; - delete fft2; - delete remap; -} - -/* ---------------------------------------------------------------------- - set size of FFT grid (nx,ny,nz_pppm) and g_ewald -------------------------------------------------------------------------- */ - -void PPPMGPU::set_grid() -{ - // see JCP 109, pg 7698 for derivation of coefficients - // higher order coefficients may be computed if needed - - double **acons = memory->create_2d_double_array(8,7,"pppm:acons"); - - acons[1][0] = 2.0 / 3.0; - acons[2][0] = 1.0 / 50.0; - acons[2][1] = 5.0 / 294.0; - acons[3][0] = 1.0 / 588.0; - acons[3][1] = 7.0 / 1440.0; - acons[3][2] = 21.0 / 3872.0; - acons[4][0] = 1.0 / 4320.0; - acons[4][1] = 3.0 / 1936.0; - acons[4][2] = 7601.0 / 2271360.0; - acons[4][3] = 143.0 / 28800.0; - acons[5][0] = 1.0 / 23232.0; - acons[5][1] = 7601.0 / 13628160.0; - acons[5][2] = 143.0 / 69120.0; - acons[5][3] = 517231.0 / 106536960.0; - acons[5][4] = 106640677.0 / 11737571328.0; - acons[6][0] = 691.0 / 68140800.0; - acons[6][1] = 13.0 / 57600.0; - acons[6][2] = 47021.0 / 35512320.0; - acons[6][3] = 9694607.0 / 2095994880.0; - acons[6][4] = 733191589.0 / 59609088000.0; - acons[6][5] = 326190917.0 / 11700633600.0; - acons[7][0] = 1.0 / 345600.0; - acons[7][1] = 3617.0 / 35512320.0; - acons[7][2] = 745739.0 / 838397952.0; - acons[7][3] = 56399353.0 / 12773376000.0; - acons[7][4] = 25091609.0 / 1560084480.0; - acons[7][5] = 1755948832039.0 / 36229939200000.0; - acons[7][6] = 4887769399.0 / 37838389248.0; - - double q2 = qsqsum / force->dielectric; - bigint natoms = atom->natoms; - - // use xprd,yprd,zprd even if triclinic so grid size is the same - // adjust z dimension for 2d slab PPPMGPU - // 3d PPPMGPU just uses zprd since slab_volfactor = 1.0 - - double xprd = domain->xprd; - double yprd = domain->yprd; - double zprd = domain->zprd; - double zprd_slab = zprd*slab_volfactor; - - // make initial g_ewald estimate - // based on desired error and real space cutoff - // fluid-occupied volume used to estimate real-space error - // zprd used rather than zprd_slab - - double hx,hy,hz; - - if (!gewaldflag) - g_ewald = sqrt(-log(precision*sqrt(natoms*cutoff*xprd*yprd*zprd) / - (2.0*q2))) / cutoff; - - // set optimal nx_pppm,ny_pppm,nz_pppm based on order and precision - // nz_pppm uses extended zprd_slab instead of zprd - // h = 1/g_ewald is upper bound on h such that h*g_ewald <= 1 - // reduce it until precision target is met - - if (!gridflag) { - double err; - hx = hy = hz = 1/g_ewald; - - nx_pppm = static_cast (xprd/hx + 1); - ny_pppm = static_cast (yprd/hy + 1); - nz_pppm = 
static_cast (zprd_slab/hz + 1); - - err = rms(hx,xprd,natoms,q2,acons); - while (err > precision) { - err = rms(hx,xprd,natoms,q2,acons); - nx_pppm++; - hx = xprd/nx_pppm; - } - - err = rms(hy,yprd,natoms,q2,acons); - while (err > precision) { - err = rms(hy,yprd,natoms,q2,acons); - ny_pppm++; - hy = yprd/ny_pppm; - } - - err = rms(hz,zprd_slab,natoms,q2,acons); - while (err > precision) { - err = rms(hz,zprd_slab,natoms,q2,acons); - nz_pppm++; - hz = zprd_slab/nz_pppm; - } - } - - // boost grid size until it is factorable - - while (!factorable(nx_pppm)) nx_pppm++; - while (!factorable(ny_pppm)) ny_pppm++; - while (!factorable(nz_pppm)) nz_pppm++; - - // adjust g_ewald for new grid size - - hx = xprd/nx_pppm; - hy = yprd/ny_pppm; - hz = zprd_slab/nz_pppm; - - if (!gewaldflag) { - double gew1,gew2,dgew,f,fmid,hmin,rtb; - int ncount; - - gew1 = 0.0; - g_ewald = gew1; - f = diffpr(hx,hy,hz,q2,acons); - - hmin = MIN(hx,MIN(hy,hz)); - gew2 = 10/hmin; - g_ewald = gew2; - fmid = diffpr(hx,hy,hz,q2,acons); - - if (f*fmid >= 0.0) error->all("Cannot compute PPPMGPU G"); - rtb = f < 0.0 ? (dgew=gew2-gew1,gew1) : (dgew=gew1-gew2,gew2); - ncount = 0; - while (fabs(dgew) > SMALL && fmid != 0.0) { - dgew *= 0.5; - g_ewald = rtb + dgew; - fmid = diffpr(hx,hy,hz,q2,acons); - if (fmid <= 0.0) rtb = g_ewald; - ncount++; - if (ncount > LARGE) error->all("Cannot compute PPPMGPU G"); - } - } - - // final RMS precision - - double lprx = rms(hx,xprd,natoms,q2,acons); - double lpry = rms(hy,yprd,natoms,q2,acons); - double lprz = rms(hz,zprd_slab,natoms,q2,acons); - double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); - double spr = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) / - sqrt(natoms*cutoff*xprd*yprd*zprd_slab); - - // free local memory - - memory->destroy_2d_double_array(acons); - - // print info - - if (me == 0) { - if (screen) { - fprintf(screen," G vector = %g\n",g_ewald); - fprintf(screen," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); - fprintf(screen," stencil order = %d\n",order); - fprintf(screen," RMS precision = %g\n",MAX(lpr,spr)); - } - if (logfile) { - fprintf(logfile," G vector = %g\n",g_ewald); - fprintf(logfile," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); - fprintf(logfile," stencil order = %d\n",order); - fprintf(logfile," RMS precision = %g\n",MAX(lpr,spr)); - } - } -} - -/* ---------------------------------------------------------------------- - check if all factors of n are in list of factors - return 1 if yes, 0 if no -------------------------------------------------------------------------- */ - -int PPPMGPU::factorable(int n) -{ - int i; - - while (n > 1) { - for (i = 0; i < nfactors; i++) { - if (n % factors[i] == 0) { - n /= factors[i]; - break; - } - } - if (i == nfactors) return 0; - } - - return 1; -} - -/* ---------------------------------------------------------------------- - compute RMS precision for a dimension -------------------------------------------------------------------------- */ - -double PPPMGPU::rms(double h, double prd, bigint natoms, - double q2, double **acons) -{ - double sum = 0.0; - for (int m = 0; m < order; m++) - sum += acons[order][m] * pow(h*g_ewald,2.0*m); - double value = q2 * pow(h*g_ewald,order) * - sqrt(g_ewald*prd*sqrt(2.0*PI)*sum/natoms) / (prd*prd); - return value; -} - -/* ---------------------------------------------------------------------- - compute difference in real-space and kspace RMS precision -------------------------------------------------------------------------- */ - -double PPPMGPU::diffpr(double hx, double hy, 
double hz, double q2, double **acons) -{ - double lprx,lpry,lprz,kspace_prec,real_prec; - double xprd = domain->xprd; - double yprd = domain->yprd; - double zprd = domain->zprd; - bigint natoms = atom->natoms; - - lprx = rms(hx,xprd,natoms,q2,acons); - lpry = rms(hy,yprd,natoms,q2,acons); - lprz = rms(hz,zprd*slab_volfactor,natoms,q2,acons); - kspace_prec = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); - real_prec = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) / - sqrt(natoms*cutoff*xprd*yprd*zprd); - double value = kspace_prec - real_prec; - return value; -} - -/* ---------------------------------------------------------------------- - denominator for Hockney-Eastwood Green's function - of x,y,z = sin(kx*deltax/2), etc - - inf n-1 - S(n,k) = Sum W(k+pi*j)**2 = Sum b(l)*(z*z)**l - j=-inf l=0 - - = -(z*z)**n /(2n-1)! * (d/dx)**(2n-1) cot(x) at z = sin(x) - gf_b = denominator expansion coeffs -------------------------------------------------------------------------- */ - -double PPPMGPU::gf_denom(double x, double y, double z) -{ - double sx,sy,sz; - sz = sy = sx = 0.0; - for (int l = order-1; l >= 0; l--) { - sx = gf_b[l] + sx*x; - sy = gf_b[l] + sy*y; - sz = gf_b[l] + sz*z; - } - double s = sx*sy*sz; - return s*s; -} - -/* ---------------------------------------------------------------------- - pre-compute Green's function denominator expansion coeffs, Gamma(2n) -------------------------------------------------------------------------- */ - -void PPPMGPU::compute_gf_denom() -{ - int k,l,m; - - for (l = 1; l < order; l++) gf_b[l] = 0.0; - gf_b[0] = 1.0; - - for (m = 1; m < order; m++) { - for (l = m; l > 0; l--) - gf_b[l] = 4.0 * (gf_b[l]*(l-m)*(l-m-0.5)-gf_b[l-1]*(l-m-1)*(l-m-1)); - gf_b[0] = 4.0 * (gf_b[0]*(l-m)*(l-m-0.5)); - } - - int ifact = 1; - for (k = 1; k < 2*order; k++) ifact *= k; - double gaminv = 1.0/ifact; - for (l = 0; l < order; l++) gf_b[l] *= gaminv; -} - -/* ---------------------------------------------------------------------- - ghost-swap to accumulate full density in brick decomposition - remap density from 3d brick decomposition to FFT decomposition -------------------------------------------------------------------------- */ - -void PPPMGPU::brick2fft() -{ - int i,n,ix,iy,iz; - MPI_Request request; - MPI_Status status; - - // pack my ghosts for +x processor - // pass data to self or +x processor - // unpack and sum recv data into my real cells - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = nxhi_in+1; ix <= nxhi_out; ix++) - buf1[n++] = density_brick[iz][iy][ix]; - - if (comm->procneigh[0][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][0],0,world,&request); - MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++) - density_brick[iz][iy][ix] += buf2[n++]; - - // pack my ghosts for -x processor - // pass data to self or -x processor - // unpack and sum recv data into my real cells - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = nxlo_out; ix < nxlo_in; ix++) - buf1[n++] = density_brick[iz][iy][ix]; - - if (comm->procneigh[0][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][1],0,world,&request); - 
MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++) - density_brick[iz][iy][ix] += buf2[n++]; - - // pack my ghosts for +y processor - // pass data to self or +y processor - // unpack and sum recv data into my real cells - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nyhi_in+1; iy <= nyhi_out; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - buf1[n++] = density_brick[iz][iy][ix]; - - if (comm->procneigh[1][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][0],0,world,&request); - MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - density_brick[iz][iy][ix] += buf2[n++]; - - // pack my ghosts for -y processor - // pass data to self or -y processor - // unpack and sum recv data into my real cells - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy < nylo_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - buf1[n++] = density_brick[iz][iy][ix]; - - if (comm->procneigh[1][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][1],0,world,&request); - MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - density_brick[iz][iy][ix] += buf2[n++]; - - // pack my ghosts for +z processor - // pass data to self or +z processor - // unpack and sum recv data into my real cells - - n = 0; - for (iz = nzhi_in+1; iz <= nzhi_out; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - buf1[n++] = density_brick[iz][iy][ix]; - - if (comm->procneigh[2][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][0],0,world,&request); - MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - density_brick[iz][iy][ix] += buf2[n++]; - - // pack my ghosts for -z processor - // pass data to self or -z processor - // unpack and sum recv data into my real cells - - n = 0; - for (iz = nzlo_out; iz < nzlo_in; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - buf1[n++] = density_brick[iz][iy][ix]; - - if (comm->procneigh[2][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][1],0,world,&request); - MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - density_brick[iz][iy][ix] += buf2[n++]; - - // remap from 3d brick decomposition to FFT decomposition - // copy grabs inner portion of density from 3d brick - // remap could be done as pre-stage of FFT, - // but this works optimally on only double values, not complex 
values - - n = 0; - for (iz = nzlo_in; iz <= nzhi_in; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = nxlo_in; ix <= nxhi_in; ix++) - density_fft[n++] = density_brick[iz][iy][ix]; - - remap->perform(density_fft,density_fft,work1); -} - -/* ---------------------------------------------------------------------- - ghost-swap to fill ghost cells of my brick with field values -------------------------------------------------------------------------- */ - -void PPPMGPU::fillbrick() -{ - int i,n,ix,iy,iz; - MPI_Request request; - MPI_Status status; - - // pack my real cells for +z processor - // pass data to self or +z processor - // unpack and sum recv data into my ghost cells - - n = 0; - int x_lo = nxlo_in * 4; - int x_hi = nxhi_in * 4 + 1; - for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[2][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][0],0,world,&request); - MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz < nzlo_in; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } - - // pack my real cells for -z processor - // pass data to self or -z processor - // unpack and sum recv data into my ghost cells - - n = 0; - for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[2][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][1],0,world,&request); - MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzhi_in+1; iz <= nzhi_out; iz++) - for (iy = nylo_in; iy <= nyhi_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } - - // pack my real cells for +y processor - // pass data to self or +y processor - // unpack and sum recv data into my ghost cells - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[1][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][0],0,world,&request); - MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy < nylo_in; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } - - // pack my real cells for -y processor - // pass data to self or -y processor - // unpack and sum recv data into my ghost cells - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_in; iy < 
nylo_in+nylo_ghost; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[1][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][1],0,world,&request); - MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nyhi_in+1; iy <= nyhi_out; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } - - // pack my real cells for +x processor - // pass data to self or +x processor - // unpack and sum recv data into my ghost cells - - n = 0; - x_lo = (nxhi_in-nxhi_ghost+1) * 4; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[0][1] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][0],0,world,&request); - MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][1],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - x_lo = nxlo_out * 4; - x_hi = nxlo_in * 4; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } - - // pack my real cells for -x processor - // pass data to self or -x processor - // unpack and sum recv data into my ghost cells - - n = 0; - x_lo = x_hi; - x_hi = (nxlo_in+nxlo_ghost) * 4; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - buf1[n++] = vd_brick[iz][iy][ix]; - buf1[n++] = vd_brick[iz][iy][ix+1]; - buf1[n++] = vd_brick[iz][iy][ix+2]; - } - - if (comm->procneigh[0][0] == me) - for (i = 0; i < n; i++) buf2[i] = buf1[i]; - else { - MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][1],0,world,&request); - MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][0],0,world); - MPI_Wait(&request,&status); - } - - n = 0; - x_lo = (nxhi_in + 1) * 4; - x_hi = nxhi_out * 4 + 1; - for (iz = nzlo_out; iz <= nzhi_out; iz++) - for (iy = nylo_out; iy <= nyhi_out; iy++) - for (ix = x_lo; ix < x_hi; ix+=4) { - vd_brick[iz][iy][ix] = buf2[n++]; - vd_brick[iz][iy][ix+1] = buf2[n++]; - vd_brick[iz][iy][ix+2] = buf2[n++]; - } -} - -/* ---------------------------------------------------------------------- - FFT-based Poisson solver -------------------------------------------------------------------------- */ - -void PPPMGPU::poisson(int eflag, int vflag) -{ - int i,j,k,n; - double eng; - - // transform charge density (r -> k) - - n = 0; - for (i = 0; i < nfft; i++) { - work1[n++] = density_fft[i]; - work1[n++] = 0.0; - } - - fft1->compute(work1,work1,1); - - // if requested, compute energy and virial contribution - - double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); - double s2 = scaleinv*scaleinv; - - if (eflag || vflag) { - if (vflag) { - n = 0; - for (i = 0; i < nfft; i++) { - eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); - for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; - energy += eng; - n += 2; - } - } else { - n = 0; - for (i = 0; i < nfft; i++) { - 
energy += - s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); - n += 2; - } - } - } - - // scale by 1/total-grid-pts to get rho(k) - // multiply by Green's function to get V(k) - - n = 0; - for (i = 0; i < nfft; i++) { - work1[n++] *= scaleinv * greensfn[i]; - work1[n++] *= scaleinv * greensfn[i]; - } - - // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) - // FFT leaves data in 3d brick decomposition - // copy it into inner portion of vdx,vdy,vdz arrays - - // x direction gradient - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) - for (j = nylo_fft; j <= nyhi_fft; j++) - for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fkx[i]*work1[n+1]; - work2[n+1] = -fkx[i]*work1[n]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - int x_hi = nxhi_in * 4 + 3; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in * 4; i < x_hi; i+=4) { - vd_brick[k][j][i] = work2[n]; - n += 2; - } - - // y direction gradient - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) - for (j = nylo_fft; j <= nyhi_fft; j++) - for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fky[j]*work1[n+1]; - work2[n+1] = -fky[j]*work1[n]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in * 4 + 1; i < x_hi; i+=4) { - vd_brick[k][j][i] = work2[n]; - n += 2; - } - - // z direction gradient - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) - for (j = nylo_fft; j <= nyhi_fft; j++) - for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fkz[k]*work1[n+1]; - work2[n+1] = -fkz[k]*work1[n]; - n += 2; - } - - fft2->compute(work2,work2,-1); - - n = 0; - for (k = nzlo_in; k <= nzhi_in; k++) - for (j = nylo_in; j <= nyhi_in; j++) - for (i = nxlo_in * 4 + 2; i < x_hi; i+=4) { - vd_brick[k][j][i] = work2[n]; - n += 2; - } -} - -/* ---------------------------------------------------------------------- - map nprocs to NX by NY grid as PX by PY procs - return optimal px,py -------------------------------------------------------------------------- */ - -void PPPMGPU::procs2grid2d(int nprocs, int nx, int ny, int *px, int *py) -{ - // loop thru all possible factorizations of nprocs - // surf = surface area of largest proc sub-domain - // innermost if test minimizes surface area and surface/volume ratio - - int bestsurf = 2 * (nx + ny); - int bestboxx = 0; - int bestboxy = 0; - - int boxx,boxy,surf,ipx,ipy; - - ipx = 1; - while (ipx <= nprocs) { - if (nprocs % ipx == 0) { - ipy = nprocs/ipx; - boxx = nx/ipx; - if (nx % ipx) boxx++; - boxy = ny/ipy; - if (ny % ipy) boxy++; - surf = boxx + boxy; - if (surf < bestsurf || - (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) { - bestsurf = surf; - bestboxx = boxx; - bestboxy = boxy; - *px = ipx; - *py = ipy; - } - } - ipx++; - } -} - -/* ---------------------------------------------------------------------- - charge assignment into rho1d - dx,dy,dz = distance of particle from "lower left" grid point -------------------------------------------------------------------------- */ - -void PPPMGPU::compute_rho1d(double dx, double dy, double dz) -{ - int k,l; - - for (k = (1-order)/2; k <= order/2; k++) { - rho1d[0][k] = 0.0; - rho1d[1][k] = 0.0; - rho1d[2][k] = 0.0; - for (l = order-1; l >= 0; l--) { - rho1d[0][k] = rho_coeff[l][k] + rho1d[0][k]*dx; - rho1d[1][k] = rho_coeff[l][k] + rho1d[1][k]*dy; - rho1d[2][k] = rho_coeff[l][k] + rho1d[2][k]*dz; - } - } -} - -/* 
---------------------------------------------------------------------- - generate coeffients for the weight function of order n - - (n-1) - Wn(x) = Sum wn(k,x) , Sum is over every other integer - k=-(n-1) - For k=-(n-1),-(n-1)+2, ....., (n-1)-2,n-1 - k is odd integers if n is even and even integers if n is odd - --- - | n-1 - | Sum a(l,j)*(x-k/2)**l if abs(x-k/2) < 1/2 - wn(k,x) = < l=0 - | - | 0 otherwise - --- - a coeffients are packed into the array rho_coeff to eliminate zeros - rho_coeff(l,((k+mod(n+1,2))/2) = a(l,k) -------------------------------------------------------------------------- */ - -void PPPMGPU::compute_rho_coeff() -{ - int j,k,l,m; - double s; - - double **a = memory->create_2d_double_array(order,-order,order,"pppm:a"); - - for (k = -order; k <= order; k++) - for (l = 0; l < order; l++) - a[l][k] = 0.0; - - a[0][0] = 1.0; - for (j = 1; j < order; j++) { - for (k = -j; k <= j; k += 2) { - s = 0.0; - for (l = 0; l < j; l++) { - a[l+1][k] = (a[l][k+1]-a[l][k-1]) / (l+1); - s += pow(0.5,(double) l+1) * - (a[l][k-1] + pow(-1.0,(double) l) * a[l][k+1]) / (l+1); - } - a[0][k] = s; - } - } - - m = (1-order)/2; - for (k = -(order-1); k < order; k += 2) { - for (l = 0; l < order; l++) - rho_coeff[l][m] = a[l][k]; - m++; - } - - memory->destroy_2d_double_array(a,-order); -} - -/* ---------------------------------------------------------------------- - Slab-geometry correction term to dampen inter-slab interactions between - periodically repeating slabs. Yields good approximation to 2D Ewald if - adequate empty space is left between repeating slabs (J. Chem. Phys. - 111, 3155). Slabs defined here to be parallel to the xy plane. -------------------------------------------------------------------------- */ - -void PPPMGPU::slabcorr(int eflag) -{ - // compute local contribution to global dipole moment - - double *q = atom->q; - double **x = atom->x; - int nlocal = atom->nlocal; - - double dipole = 0.0; - for (int i = 0; i < nlocal; i++) dipole += q[i]*x[i][2]; - - // sum local contributions to get global dipole moment - - double dipole_all; - MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world); - - // compute corrections - - double e_slabcorr = 2.0*PI*dipole_all*dipole_all/volume; - - if (eflag) energy += qqrd2e*scale * e_slabcorr; - - // add on force corrections - - double ffact = -4.0*PI*dipole_all/volume; - double **f = atom->f; - - for (int i = 0; i < nlocal; i++) f[i][2] += qqrd2e*scale * q[i]*ffact; -} - -/* ---------------------------------------------------------------------- - perform and time the 4 FFTs required for N timesteps -------------------------------------------------------------------------- */ - -void PPPMGPU::timing(int n, double &time3d, double &time1d) -{ - double time1,time2; - - for (int i = 0; i < 2*nfft_both; i++) work1[i] = 0.0; - - MPI_Barrier(world); - time1 = MPI_Wtime(); - - for (int i = 0; i < n; i++) { - fft1->compute(work1,work1,1); - fft2->compute(work1,work1,-1); - fft2->compute(work1,work1,-1); - fft2->compute(work1,work1,-1); - } - - MPI_Barrier(world); - time2 = MPI_Wtime(); - time3d = time2 - time1; - - MPI_Barrier(world); - time1 = MPI_Wtime(); - - for (int i = 0; i < n; i++) { - fft1->timing1d(work1,nfft_both,1); - fft2->timing1d(work1,nfft_both,-1); - fft2->timing1d(work1,nfft_both,-1); - fft2->timing1d(work1,nfft_both,-1); - } - - MPI_Barrier(world); - time2 = MPI_Wtime(); - time1d = time2 - time1; -} - -/* ---------------------------------------------------------------------- - Create array using offsets from pinned memory 
allocation -------------------------------------------------------------------------- */ - -numtyp ***PPPMGPU::create_3d_offset(int n1lo, int n1hi, int n2lo, int n2hi, - int n3lo, int n3hi, const char *name, - numtyp *data, int vec_length) -{ - int i,j; - int n1 = n1hi - n1lo + 1; - int n2 = n2hi - n2lo + 1; - int n3 = n3hi - n3lo + 1; - - numtyp **plane = (numtyp **)memory->smalloc(n1*n2*sizeof(numtyp *),name); - numtyp ***array = (numtyp ***)memory->smalloc(n1*sizeof(numtyp **),name); - - int n = 0; - for (i = 0; i < n1; i++) { - array[i] = &plane[i*n2]; - for (j = 0; j < n2; j++) { - plane[i*n2+j] = &data[n]; - n += n3*vec_length; - } - } - - for (i = 0; i < n1*n2; i++) array[0][i] -= n3lo*vec_length; - for (i = 0; i < n1; i++) array[i] -= n2lo; - return array-n1lo; -} - -/* ---------------------------------------------------------------------- - templated 3d memory offsets -------------------------------------------------------------------------- */ - -void PPPMGPU::destroy_3d_offset(numtyp ***array, int n1_offset, - int n2_offset) -{ - if (array == NULL) return; - memory->sfree(&array[n1_offset][n2_offset]); - memory->sfree(array + n1_offset); -} - /* ---------------------------------------------------------------------- memory usage of local arrays ------------------------------------------------------------------------- */ -double PPPMGPU::memory_usage() +double PPPMGPUSingle::memory_usage() { double bytes = nmax*3 * sizeof(double); int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * (nzhi_out-nzlo_out+1); - bytes += 4 * nbrick * sizeof(double); + bytes += 4 * nbrick * sizeof(grdtyp); bytes += 6 * nfft_both * sizeof(double); bytes += nfft_both*6 * sizeof(double); bytes += 2 * nbuf * sizeof(double); - return bytes + pppm_gpu_bytes(); + return bytes + pppm_gpu_bytes_f(); }
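
The hunks above replace the boolean success flag of the GPU init path with an integer status code (0 on success, negative codes as documented in pppm_gpu_memory.h) and have pppm_gpu_single.cpp reduce that status across MPI ranks with MPI_MIN before mapping it to one error message. The stand-alone sketch below illustrates that pattern in isolation; device_supports_double() and init_on_this_rank() are hypothetical stand-ins for demonstration, not the LAMMPS or GPU-library API.

#include <mpi.h>
#include <cstdio>
#include <cstdlib>

// Pretend device query; hard-coded here only so the demo takes the error path.
static bool device_supports_double() { return false; }

// Per-rank initialization mirroring the flag convention introduced above:
// 0 on success, negative codes on failure.
static int init_on_this_rank(bool want_double) {
  if (want_double && !device_supports_double())
    return -5;                 // double precision not supported on the card
  // ... allocate grids here; return -3 on an out-of-memory error ...
  return 0;
}

int main(int argc, char **argv) {
  MPI_Init(&argc, &argv);

  // Every rank records a flag instead of returning early, so all ranks
  // still reach the collective call below.
  int flag = init_on_this_rank(true);

  // MPI_MIN keeps the most negative (most severe) code seen on any rank,
  // so every rank agrees on a single outcome to report.
  int all_flag;
  MPI_Allreduce(&flag, &all_flag, 1, MPI_INT, MPI_MIN, MPI_COMM_WORLD);

  if (all_flag != 0) {
    int me;
    MPI_Comm_rank(MPI_COMM_WORLD, &me);
    if (me == 0) {
      if (all_flag == -5)
        fprintf(stderr, "Double precision is not supported on this GPU.\n");
      else if (all_flag == -3)
        fprintf(stderr, "Out of memory on GPU.\n");
      else
        fprintf(stderr, "Unknown error in GPU library (%d).\n", all_flag);
    }
    MPI_Finalize();
    return EXIT_FAILURE;
  }

  MPI_Finalize();
  return 0;
}

Reducing to the most severe code before anyone aborts is also why the patch drops the early "if (!success) return host_brick;" exits in pppm_l_gpu.cpp: a rank that fails must still reach world_barrier(), gpu_barrier(), and the reduction in the caller instead of leaving the other ranks blocked.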
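The host code retained in fillbrick() and poisson() walks vd_brick with ix advancing in steps of 4 because create_3d_offset(..., "pppm:vd_brick", data, 4) lays the brick out as four interleaved values per grid point (vdx, vdy, vdz plus one padding slot) inside a single flat allocation shared with the GPU. Below is a minimal sketch of that indexing, with made-up grid dimensions and grdtyp taken as float as in the single-precision variant; the real code addresses the same buffer through the offset pointer tables built by create_3d_offset.

#include <cstdio>
#include <vector>

int main() {
  // Toy brick dimensions (the real code uses nzlo_out..nzhi_out, etc.).
  const int nx = 4, ny = 3, nz = 2;
  const int vec = 4;                      // vdx, vdy, vdz, padding

  std::vector<float> data(nz * ny * nx * vec, 0.0f);

  // Flat index of component c (0=x, 1=y, 2=z) at grid point (ix,iy,iz),
  // equivalent to a [iz][iy][ix*4 + c] view of the same buffer.
  auto idx = [=](int ix, int iy, int iz, int c) {
    return ((iz * ny + iy) * nx + ix) * vec + c;
  };

  data[idx(1, 2, 0, 0)] = 0.5f;           // x-field component at (1,2,0)
  data[idx(1, 2, 0, 1)] = -1.0f;          // y-field component at (1,2,0)

  printf("vdx=%g vdy=%g\n",
         data[idx(1, 2, 0, 0)], data[idx(1, 2, 0, 1)]);
  return 0;
}

Padding each grid point to four values keeps the per-point data aligned, presumably so the interpolation kernel can fetch vdx, vdy, vdz as a single grdtyp4-style vector load, at the cost of one unused slot per point.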