diff --git a/src/GPU/gpu_extra.h b/src/GPU/gpu_extra.h new file mode 100644 index 0000000000..30faef139d --- /dev/null +++ b/src/GPU/gpu_extra.h @@ -0,0 +1,47 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Mike Brown (ORNL) +------------------------------------------------------------------------- */ + +#ifndef LMP_GPU_EXTRA_H +#define LMP_GPU_EXTRA_H + +#include "error.h" + +namespace GPU_EXTRA { + + inline void check_flag(int error_flag, LAMMPS_NS::Error *error, + MPI_Comm &world) { + int all_success; + MPI_Allreduce(&error_flag, &all_success, 1, MPI_INT, MPI_MIN, world); + if (all_success != 0) { + if (all_success == -1) + error->all("Accelerated style in input script but no fix gpu."); + else if (all_success == -2) + error->all("Could not find/initialize a specified accelerator device."); + else if (all_success == -3) + error->all("Insufficient memory on accelerator."); + else if (all_success == -4) + error->all("GPU library not compiled for this accelerator."); + else if (all_success == -5) + error->all("Double precision is not supported on this accelerator."); + else + error->all("Unknown error in GPU library."); + } + } + +} + +#endif diff --git a/src/GPU/pair_lj_expand_gpu.cpp b/src/GPU/pair_lj_expand_gpu.cpp new file mode 100644 index 0000000000..b4c50d55c4 --- /dev/null +++ b/src/GPU/pair_lj_expand_gpu.cpp @@ -0,0 +1,236 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Inderaj Bains (NVIDIA), ibains@nvidia.com +------------------------------------------------------------------------- */ + +#include "math.h" +#include "stdio.h" +#include "stdlib.h" +#include "pair_lj_expand_gpu.h" +#include "lmptype.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "integrate.h" +#include "memory.h" +#include "error.h" +#include "neigh_request.h" +#include "universe.h" +#include "update.h" +#include "domain.h" +#include "string.h" +#include "gpu_extra.h" + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + +// External functions from cuda library for atom decomposition + +int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, int &gpu_mode, + FILE *screen); +void lje_gpu_clear(); +int ** lje_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void lje_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); +double lje_gpu_bytes(); + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairLJExpandGPU::PairLJExpandGPU(LAMMPS *lmp) : PairLJExpand(lmp), gpu_mode(GPU_PAIR) +{ + respa_enable = 0; + cpu_time = 0.0; +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairLJExpandGPU::~PairLJExpandGPU() +{ + lje_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJExpandGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode == GPU_NEIGH) { + inum = atom->nlocal; + firstneigh = lje_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + lje_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); + } + if (!success) + error->one("Out of memory on GPGPU"); + + if (host_startnewton_pair) + error->all("Cannot use newton pair with GPU LJ pair style"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + int success = lje_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, + offset, shift, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + 
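The init return code above is funneled through GPU_EXTRA::check_flag(), which reduces every rank's flag with MPI_MIN so all processes agree on the most severe accelerator error before aborting. A minimal standalone sketch of that pattern follows; report_gpu_error() is a hypothetical stand-in for LAMMPS' Error::all(), and the flag values mirror the codes listed in gpu_extra.h.

// Standalone sketch (not part of the patch) of the error-aggregation pattern
// in GPU_EXTRA::check_flag(): each rank contributes its local init flag, the
// MPI_MIN reduction picks the most negative (most severe) code, and every
// rank then takes the same abort decision.
#include <mpi.h>
#include <cstdio>

static void report_gpu_error(int code, MPI_Comm comm)   // hypothetical helper
{
  const char *msg;
  if      (code == -1) msg = "Accelerated style in input script but no fix gpu.";
  else if (code == -2) msg = "Could not find/initialize a specified accelerator device.";
  else if (code == -3) msg = "Insufficient memory on accelerator.";
  else if (code == -4) msg = "GPU library not compiled for this accelerator.";
  else if (code == -5) msg = "Double precision is not supported on this accelerator.";
  else                 msg = "Unknown error in GPU library.";
  fprintf(stderr,"%s\n",msg);
  MPI_Abort(comm,1);
}

int main(int argc, char **argv)
{
  MPI_Init(&argc,&argv);
  MPI_Comm world = MPI_COMM_WORLD;

  int local_flag = 0;               // 0 = success; a failing rank would set -1..-5
  int all_flag;
  MPI_Allreduce(&local_flag,&all_flag,1,MPI_INT,MPI_MIN,world);

  if (all_flag != 0) report_gpu_error(all_flag,world);   // collective decision
  MPI_Finalize();
  return 0;
}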
neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairLJExpandGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + lje_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJExpandGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double rsq,r2inv,r6inv,forcelj,factor_lj; + double r,rshift,rshiftsq; + int *jlist; + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + double *special_lj = force->special_lj; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r = sqrt(rsq); + rshift = r - shift[itype][jtype]; + rshiftsq = rshift*rshift; + r2inv = 1.0/rshiftsq; + r6inv = r2inv*r2inv*r2inv; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + fpair = factor_lj*forcelj/rshift/r; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) - + offset[itype][jtype]; + evdwl *= factor_lj; + } + + if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); + } + } + } +} diff --git a/src/GPU/pair_lj_expand_gpu.h b/src/GPU/pair_lj_expand_gpu.h new file mode 100644 index 0000000000..96619034c7 --- /dev/null +++ b/src/GPU/pair_lj_expand_gpu.h @@ -0,0 +1,47 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
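The cpu_compute() loop above evaluates an ordinary 12-6 Lennard-Jones interaction at the shifted separation r - delta. The standalone sketch below applies the same arithmetic to a single pair; the lj1..lj4 prefactors follow the usual LAMMPS precomputation (48*eps*sigma^12, 24*eps*sigma^6, 4*eps*sigma^12, 4*eps*sigma^6), and epsilon, sigma, delta, r are illustrative values only.

// Standalone sketch of the lj/expand kernel for one pair, mirroring the
// cpu_compute() loop: the potential is evaluated at the shifted distance
// rshift = r - delta, and fpair is force/r as used for delx,dely,delz.
#include <cmath>
#include <cstdio>

int main()
{
  double epsilon = 1.0, sigma = 1.0, delta = 0.5;   // illustrative coefficients
  double lj1 = 48.0*epsilon*pow(sigma,12.0);
  double lj2 = 24.0*epsilon*pow(sigma,6.0);
  double lj3 =  4.0*epsilon*pow(sigma,12.0);
  double lj4 =  4.0*epsilon*pow(sigma,6.0);

  double r = 1.7;                     // pair distance
  double rshift = r - delta;          // shifted distance used by lj/expand
  double r2inv = 1.0/(rshift*rshift);
  double r6inv = r2inv*r2inv*r2inv;

  double forcelj = r6inv*(lj1*r6inv - lj2);
  double fpair   = forcelj/rshift/r;  // force/r
  double evdwl   = r6inv*(lj3*r6inv - lj4);   // pair energy (no offset)

  printf("r = %g  f/r = %g  E = %g\n",r,fpair,evdwl);
  return 0;
}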
+------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/expand/gpu,PairLJExpandGPU) + +#else + +#ifndef LMP_PAIR_LJE_LIGHT_GPU_H +#define LMP_PAIR_LJE_LIGHT_GPU_H + +#include "pair_lj_expand.h" + +namespace LAMMPS_NS { + +class PairLJExpandGPU : public PairLJExpand { + public: + PairLJExpandGPU(LAMMPS *lmp); + ~PairLJExpandGPU(); + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int); + void init_style(); + double memory_usage(); + + enum { GPU_PAIR, GPU_NEIGH }; + + private: + int gpu_mode; + double cpu_time; + int *gpulist; +}; + +} +#endif +#endif + diff --git a/src/GPU/pair_morse_gpu.cpp b/src/GPU/pair_morse_gpu.cpp new file mode 100644 index 0000000000..711359a300 --- /dev/null +++ b/src/GPU/pair_morse_gpu.cpp @@ -0,0 +1,230 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Mike Brown (SNL) +------------------------------------------------------------------------- */ + +#include "math.h" +#include "stdio.h" +#include "stdlib.h" +#include "pair_morse_gpu.h" +#include "lmptype.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "integrate.h" +#include "memory.h" +#include "error.h" +#include "neigh_request.h" +#include "universe.h" +#include "update.h" +#include "domain.h" +#include "string.h" +#include "gpu_extra.h" + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + +// External functions from cuda library for atom decomposition + +int mor_gpu_init(const int ntypes, double **cutsq, double **host_morse1, + double **host_r0, double **host_alpha, double **host_d0, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); +void mor_gpu_clear(); +int ** mor_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, int *tag, int **nspecial, + int **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); +void mor_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); +double mor_gpu_bytes(); + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +PairMorseGPU::PairMorseGPU(LAMMPS *lmp) : PairMorse(lmp), gpu_mode(GPU_PAIR) +{ + cpu_time = 0.0; +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairMorseGPU::~PairMorseGPU() +{ + mor_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairMorseGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode == GPU_NEIGH) { + inum = atom->nlocal; + firstneigh = mor_gpu_compute_n(neighbor->ago, inum, nall, + atom->x, atom->type, domain->sublo, + domain->subhi, atom->tag, atom->nspecial, + atom->special, eflag, vflag, eflag_atom, + vflag_atom, host_start, &ilist, &numneigh, + cpu_time, success); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + mor_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag, eflag_atom, + vflag_atom, host_start, cpu_time, success); + } + if (!success) + error->one("Out of memory on GPGPU"); + + if (host_startnewton_pair) + error->all("Cannot use newton pair with GPU Morse pair style"); + + // Repeat cutsq calculation because done after call to init_style + double maxcut = -1.0; + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { + cut = init_one(i,j); + cut *= cut; + if (cut > maxcut) + maxcut = cut; + cutsq[i][j] = cutsq[j][i] = cut; + } else + cutsq[i][j] = cutsq[j][i] = 0.0; + } + } + double cell_size = sqrt(maxcut) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + int success = mor_gpu_init(atom->ntypes+1, cutsq, morse1, r0, alpha, d0, + offset, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, 300, maxspecial, + cell_size, gpu_mode, screen); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode != GPU_NEIGH) { + int irequest = neighbor->request(this); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + 
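The host fallback for this style (cpu_compute(), below) uses the standard Morse kernel with morse1 = 2*D0*alpha precomputed by the base PairMorse class. A standalone sketch of that kernel for one pair, with illustrative parameter values:

// Standalone sketch of the Morse pair kernel:
//   E(r) = D0*(exp(-2*alpha*(r-r0)) - 2*exp(-alpha*(r-r0)))
// with morse1 = 2*D0*alpha, so force/r = morse1*(dexp*dexp - dexp)/r.
#include <cmath>
#include <cstdio>

int main()
{
  double d0 = 1.0, alpha = 2.0, r0 = 1.1;   // illustrative Morse parameters
  double morse1 = 2.0*d0*alpha;

  double r = 1.5;
  double dr = r - r0;
  double dexp = exp(-alpha*dr);

  double fpair = morse1*(dexp*dexp - dexp)/r;   // force/r
  double evdwl = d0*(dexp*dexp - 2.0*dexp);     // pair energy (no offset)

  printf("r = %g  f/r = %g  E = %g\n",r,fpair,evdwl);
  return 0;
}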
} +} + +/* ---------------------------------------------------------------------- */ + +double PairMorseGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + mor_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairMorseGPU::cpu_compute(int start, int inum, int eflag, int vflag, + int *ilist, int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; + double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair; + double rsq,r,dr,dexp,factor_lj; + int *jlist; + + double **x = atom->x; + double **f = atom->f; + int *type = atom->type; + int nlocal = atom->nlocal; + int nall = nlocal + atom->nghost; + double *special_lj = force->special_lj; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq = delx*delx + dely*dely + delz*delz; + jtype = type[j]; + + if (rsq < cutsq[itype][jtype]) { + r = sqrt(rsq); + dr = r - r0[itype][jtype]; + dexp = exp(-alpha[itype][jtype] * dr); + fpair = factor_lj * morse1[itype][jtype] * (dexp*dexp - dexp) / r; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + evdwl = d0[itype][jtype] * (dexp*dexp - 2.0*dexp) - + offset[itype][jtype]; + evdwl *= factor_lj; + } + + if (evflag) ev_tally_full(i,evdwl,0.0,fpair,delx,dely,delz); + } + } + } +} diff --git a/src/GPU/pair_morse_gpu.h b/src/GPU/pair_morse_gpu.h new file mode 100644 index 0000000000..ee9ba576d5 --- /dev/null +++ b/src/GPU/pair_morse_gpu.h @@ -0,0 +1,47 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(morse/gpu,PairMorseGPU) + +#else + +#ifndef LMP_PAIR_MORSE_GPU_H +#define LMP_PAIR_MORSE_GPU_H + +#include "pair_morse.h" + +namespace LAMMPS_NS { + +class PairMorseGPU : public PairMorse { + public: + PairMorseGPU(LAMMPS *lmp); + ~PairMorseGPU(); + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int); + void init_style(); + double memory_usage(); + + enum { GPU_PAIR, GPU_NEIGH }; + + private: + int gpu_mode; + double cpu_time; + int *gpulist; +}; + +} +#endif +#endif + diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp new file mode 100644 index 0000000000..d447419ce7 --- /dev/null +++ b/src/GPU/pppm_gpu.cpp @@ -0,0 +1,1743 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "string.h" +#include "stdio.h" +#include "stdlib.h" +#include "math.h" +#include "pppm_gpu.h" +#include "lmptype.h" +#include "atom.h" +#include "comm.h" +#include "neighbor.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "domain.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "memory.h" +#include "error.h" + +using namespace LAMMPS_NS; + +#define MAXORDER 7 +#define OFFSET 16384 +#define SMALL 0.00001 +#define LARGE 10000.0 +#define EPS_HOC 1.0e-7 + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +#define PPPMGPUT PPPMGPU + +/* ---------------------------------------------------------------------- */ + +template +PPPMGPUT::PPPMGPU(LAMMPS *lmp, int narg, char **arg) : KSpace(lmp, narg, arg) +{ + if (narg != 1) error->all("Illegal kspace_style pppm command"); + + precision = atof(arg[0]); + PI = 4.0*atan(1.0); + + nfactors = 3; + factors = new int[nfactors]; + factors[0] = 2; + factors[1] = 3; + factors[2] = 5; + + MPI_Comm_rank(world,&me); + MPI_Comm_size(world,&nprocs); + + density_brick = NULL; + vd_brick = NULL; + density_fft = NULL; + greensfn = NULL; + work1 = work2 = NULL; + vg = NULL; + fkx = fky = fkz = NULL; + buf1 = buf2 = NULL; + + gf_b = NULL; + rho1d = rho_coeff = NULL; + + fft1 = fft2 = NULL; + remap = NULL; + + nmax = 0; +} + +/* ---------------------------------------------------------------------- + free all memory +------------------------------------------------------------------------- */ + +template +PPPMGPUT::~PPPMGPU() +{ + delete [] factors; + deallocate(); +} + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::base_init() +{ + if (me == 0) { + if (screen) fprintf(screen,"PPPM initialization ...\n"); + if (logfile) fprintf(logfile,"PPPM initialization ...\n"); + } + + // error check + + if (domain->triclinic) + error->all("Cannot (yet) use PPPM with triclinic box"); + if (domain->dimension == 2) error->all("Cannot use PPPM with 2d simulation"); + + if (!atom->q_flag) error->all("Kspace style requires atom attribute q"); + + if (slabflag == 0 && domain->nonperiodic > 0) + error->all("Cannot use nonperiodic boundaries with PPPM"); + if (slabflag == 1) { + if (domain->xperiodic != 1 || domain->yperiodic != 1 || + domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) + error->all("Incorrect boundaries with slab PPPM"); + } + + if (order > MAXORDER) { + char str[128]; + sprintf(str,"PPPM order cannot be greater than %d",MAXORDER); + error->all(str); + } + + // free all arrays previously allocated + + deallocate(); + + // extract short-range Coulombic cutoff from pair style + + qqrd2e = force->qqrd2e; + scale = 1.0; + + if (force->pair == NULL) + error->all("KSpace style is incompatible with Pair style"); + int itmp; + double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); + if (p_cutoff == NULL) + error->all("KSpace style is 
incompatible with Pair style"); + cutoff = *p_cutoff; + + // if kspace is TIP4P, extract TIP4P params from pair style + // bond/angle are not yet init(), so insure equilibrium request is valid + + qdist = 0.0; + + if (strcmp(force->kspace_style,"pppm/tip4p") == 0) { + if (force->pair == NULL) + error->all("KSpace style is incompatible with Pair style"); + double *p_qdist = (double *) force->pair->extract("qdist",itmp); + int *p_typeO = (int *) force->pair->extract("typeO",itmp); + int *p_typeH = (int *) force->pair->extract("typeH",itmp); + int *p_typeA = (int *) force->pair->extract("typeA",itmp); + int *p_typeB = (int *) force->pair->extract("typeB",itmp); + if (!p_qdist || !p_typeO || !p_typeH || !p_typeA || !p_typeB) + error->all("KSpace style is incompatible with Pair style"); + qdist = *p_qdist; + typeO = *p_typeO; + typeH = *p_typeH; + int typeA = *p_typeA; + int typeB = *p_typeB; + + if (force->angle == NULL || force->bond == NULL) + error->all("Bond and angle potentials must be defined for TIP4P"); + if (typeA < 1 || typeA > atom->nangletypes || + force->angle->setflag[typeA] == 0) + error->all("Bad TIP4P angle type for PPPM/TIP4P"); + if (typeB < 1 || typeA > atom->nbondtypes || + force->bond->setflag[typeB] == 0) + error->all("Bad TIP4P bond type for PPPM/TIP4P"); + double theta = force->angle->equilibrium_angle(typeA); + double blen = force->bond->equilibrium_distance(typeB); + alpha = qdist / (2.0 * cos(0.5*theta) * blen); + } + + // compute qsum & qsqsum and warn if not charge-neutral + + qsum = qsqsum = 0.0; + for (int i = 0; i < atom->nlocal; i++) { + qsum += atom->q[i]; + qsqsum += atom->q[i]*atom->q[i]; + } + + double tmp; + MPI_Allreduce(&qsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + qsum = tmp; + MPI_Allreduce(&qsqsum,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + qsqsum = tmp; + + if (qsqsum == 0.0) + error->all("Cannot use kspace solver on system with no charge"); + if (fabs(qsum) > SMALL && me == 0) { + char str[128]; + sprintf(str,"System is not charge neutral, net charge = %g",qsum); + error->warning(str); + } + + // setup FFT grid resolution and g_ewald + // normally one iteration thru while loop is all that is required + // if grid stencil extends beyond neighbor proc, reduce order and try again + + int iteration = 0; + + while (order > 0) { + + if (iteration && me == 0) + error->warning("Reducing PPPM order b/c stencil extends " + "beyond neighbor processor"); + iteration++; + + set_grid(); + + if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET) + error->all("PPPM grid is too large"); + + // global indices of PPPM grid range from 0 to N-1 + // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of + // global PPPM grid that I own without ghost cells + // for slab PPPM, assign z grid as if it were not extended + + nxlo_in = comm->myloc[0]*nx_pppm / comm->procgrid[0]; + nxhi_in = (comm->myloc[0]+1)*nx_pppm / comm->procgrid[0] - 1; + nylo_in = comm->myloc[1]*ny_pppm / comm->procgrid[1]; + nyhi_in = (comm->myloc[1]+1)*ny_pppm / comm->procgrid[1] - 1; + nzlo_in = comm->myloc[2] * + (static_cast (nz_pppm/slab_volfactor)) / comm->procgrid[2]; + nzhi_in = (comm->myloc[2]+1) * + (static_cast (nz_pppm/slab_volfactor)) / comm->procgrid[2] - 1; + + // nlower,nupper = stencil size for mapping particles to PPPM grid + + nlower = -(order-1)/2; + nupper = order/2; + + // shift values for particle <-> grid mapping + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + if (order % 2) shift = OFFSET + 0.5; + else shift = OFFSET; + if (order % 2) shiftone = 0.0; 
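Each processor's owned sub-brick of the global PPPM grid is carved out with plain integer arithmetic on its location in the processor grid, and the interpolation stencil spans nlower..nupper around the nearest grid point. A standalone sketch of that bookkeeping for one dimension, with illustrative nx_pppm, procgrid and order values:

// Standalone sketch of the sub-brick assignment and stencil bounds used in
// base_init(); all numbers below are illustrative, not from a real run.
#include <cstdio>

int main()
{
  int nx_pppm = 18, procgrid = 4, order = 5;

  for (int myloc = 0; myloc < procgrid; myloc++) {
    int nxlo_in = myloc*nx_pppm / procgrid;           // first owned grid plane
    int nxhi_in = (myloc+1)*nx_pppm / procgrid - 1;   // last owned grid plane
    printf("proc %d owns x planes %d..%d\n",myloc,nxlo_in,nxhi_in);
  }

  int nlower = -(order-1)/2;   // stencil extent below the nearest grid point
  int nupper = order/2;        // stencil extent above
  printf("order %d stencil: %d..%d\n",order,nlower,nupper);
  return 0;
}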
+ else shiftone = 0.5; + + // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of + // global PPPM grid that my particles can contribute charge to + // effectively nlo_in,nhi_in + ghost cells + // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest + // position a particle in my box can be at + // dist[3] = particle position bound = subbox + skin/2.0 + qdist + // qdist = offset due to TIP4P fictitious charge + // convert to triclinic if necessary + // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping + // for slab PPPM, assign z grid as if it were not extended + + triclinic = domain->triclinic; + double *prd,*sublo,*subhi; + + if (triclinic == 0) { + prd = domain->prd; + boxlo = domain->boxlo; + sublo = domain->sublo; + subhi = domain->subhi; + } else { + prd = domain->prd_lamda; + boxlo = domain->boxlo_lamda; + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + double dist[3]; + double cuthalf = 0.5*neighbor->skin + qdist; + if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf; + else { + dist[0] = cuthalf/domain->prd[0]; + dist[1] = cuthalf/domain->prd[1]; + dist[2] = cuthalf/domain->prd[2]; + } + + int nlo,nhi; + + nlo = static_cast ((sublo[0]-dist[0]-boxlo[0]) * + nx_pppm/xprd + shift) - OFFSET; + nhi = static_cast ((subhi[0]+dist[0]-boxlo[0]) * + nx_pppm/xprd + shift) - OFFSET; + nxlo_out = nlo + nlower; + nxhi_out = nhi + nupper; + + nlo = static_cast ((sublo[1]-dist[1]-boxlo[1]) * + ny_pppm/yprd + shift) - OFFSET; + nhi = static_cast ((subhi[1]+dist[1]-boxlo[1]) * + ny_pppm/yprd + shift) - OFFSET; + nylo_out = nlo + nlower; + nyhi_out = nhi + nupper; + + nlo = static_cast ((sublo[2]-dist[2]-boxlo[2]) * + nz_pppm/zprd_slab + shift) - OFFSET; + nhi = static_cast ((subhi[2]+dist[2]-boxlo[2]) * + nz_pppm/zprd_slab + shift) - OFFSET; + nzlo_out = nlo + nlower; + nzhi_out = nhi + nupper; + + // for slab PPPM, change the grid boundary for processors at +z end + // to include the empty volume between periodically repeating slabs + // for slab PPPM, want charge data communicated from -z proc to +z proc, + // but not vice versa, also want field data communicated from +z proc to + // -z proc, but not vice versa + // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells) + + if (slabflag && ((comm->myloc[2]+1) == (comm->procgrid[2]))) { + nzhi_in = nz_pppm - 1; + nzhi_out = nz_pppm - 1; + } + + // nlo_ghost,nhi_ghost = # of planes I will recv from 6 directions + // that overlay domain I own + // proc in that direction tells me via sendrecv() + // if no neighbor proc, value is from self since I have ghosts regardless + + int nplanes; + MPI_Status status; + + nplanes = nxlo_in - nxlo_out; + if (comm->procneigh[0][0] != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][0],0, + &nxhi_ghost,1,MPI_INT,comm->procneigh[0][1],0, + world,&status); + else nxhi_ghost = nplanes; + + nplanes = nxhi_out - nxhi_in; + if (comm->procneigh[0][1] != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[0][1],0, + &nxlo_ghost,1,MPI_INT,comm->procneigh[0][0], + 0,world,&status); + else nxlo_ghost = nplanes; + + nplanes = nylo_in - nylo_out; + if (comm->procneigh[1][0] != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][0],0, + &nyhi_ghost,1,MPI_INT,comm->procneigh[1][1],0, + world,&status); + else nyhi_ghost = nplanes; + + nplanes = nyhi_out - nyhi_in; + if (comm->procneigh[1][1] != me) + 
MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[1][1],0, + &nylo_ghost,1,MPI_INT,comm->procneigh[1][0],0, + world,&status); + else nylo_ghost = nplanes; + + nplanes = nzlo_in - nzlo_out; + if (comm->procneigh[2][0] != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][0],0, + &nzhi_ghost,1,MPI_INT,comm->procneigh[2][1],0, + world,&status); + else nzhi_ghost = nplanes; + + nplanes = nzhi_out - nzhi_in; + if (comm->procneigh[2][1] != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,comm->procneigh[2][1],0, + &nzlo_ghost,1,MPI_INT,comm->procneigh[2][0],0, + world,&status); + else nzlo_ghost = nplanes; + + // test that ghost overlap is not bigger than my sub-domain + + int flag = 0; + if (nxlo_ghost > nxhi_in-nxlo_in+1) flag = 1; + if (nxhi_ghost > nxhi_in-nxlo_in+1) flag = 1; + if (nylo_ghost > nyhi_in-nylo_in+1) flag = 1; + if (nyhi_ghost > nyhi_in-nylo_in+1) flag = 1; + if (nzlo_ghost > nzhi_in-nzlo_in+1) flag = 1; + if (nzhi_ghost > nzhi_in-nzlo_in+1) flag = 1; + + int flag_all; + MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world); + + if (flag_all == 0) break; + order--; + } + + if (order == 0) error->all("PPPM order has been reduced to 0"); + + // decomposition of FFT mesh + // global indices range from 0 to N-1 + // proc owns entire x-dimension, clump of columns in y,z dimensions + // npey_fft,npez_fft = # of procs in y,z dims + // if nprocs is small enough, proc can own 1 or more entire xy planes, + // else proc owns 2d sub-blocks of yz plane + // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions + // nlo_fft,nhi_fft = lower/upper limit of the section + // of the global FFT mesh that I own + + int npey_fft,npez_fft; + if (nz_pppm >= nprocs) { + npey_fft = 1; + npez_fft = nprocs; + } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); + + int me_y = me % npey_fft; + int me_z = me / npey_fft; + + nxlo_fft = 0; + nxhi_fft = nx_pppm - 1; + nylo_fft = me_y*ny_pppm/npey_fft; + nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1; + nzlo_fft = me_z*nz_pppm/npez_fft; + nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1; + + // PPPM grid for this proc, including ghosts + + ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + + // FFT arrays on this proc, without ghosts + // nfft = FFT points in FFT decomposition on this proc + // nfft_brick = FFT points in 3d brick-decomposition on this proc + // nfft_both = greater of 2 values + + nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) * + (nzhi_fft-nzlo_fft+1); + int nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) * + (nzhi_in-nzlo_in+1); + nfft_both = MAX(nfft,nfft_brick); + + // buffer space for use in brick2fft and fillbrick + // idel = max # of ghost planes to send or recv in +/- dir of each dim + // nx,ny,nz = owned planes (including ghosts) in each dim + // nxx,nyy,nzz = max # of grid cells to send in each dim + // nbuf = max in any dim, augment by 3x for components of vd_xyz in fillbrick + + int idelx,idely,idelz,nx,ny,nz,nxx,nyy,nzz; + + idelx = MAX(nxlo_ghost,nxhi_ghost); + idelx = MAX(idelx,nxhi_out-nxhi_in); + idelx = MAX(idelx,nxlo_in-nxlo_out); + + idely = MAX(nylo_ghost,nyhi_ghost); + idely = MAX(idely,nyhi_out-nyhi_in); + idely = MAX(idely,nylo_in-nylo_out); + + idelz = MAX(nzlo_ghost,nzhi_ghost); + idelz = MAX(idelz,nzhi_out-nzhi_in); + idelz = MAX(idelz,nzlo_in-nzlo_out); + + nx = nxhi_out - nxlo_out + 1; + ny = nyhi_out - nylo_out + 1; + nz = nzhi_out - nzlo_out + 1; + + nxx = idelx * ny * nz; + nyy = idely * nx * nz; + nzz = idelz * nx * ny; + + nbuf = MAX(nxx,nyy); + nbuf = MAX(nbuf,nzz); + 
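For the FFTs the mesh is re-decomposed into pencils: every processor keeps the full x extent and a contiguous block of y/z columns given by its position in an npey_fft x npez_fft grid. The sketch below reproduces that index arithmetic; the simple factorization standing in for procs2grid2d() is a placeholder, not the real routine.

// Standalone sketch of the FFT-mesh decomposition: proc "me" sits at
// (me_y, me_z) in an npey_fft x npez_fft grid and owns the y/z ranges below.
#include <cstdio>

int main()
{
  int nprocs = 6, nx_pppm = 16, ny_pppm = 16, nz_pppm = 16;

  int npey_fft, npez_fft;
  if (nz_pppm >= nprocs) { npey_fft = 1; npez_fft = nprocs; }
  else { npey_fft = 2; npez_fft = nprocs/2; }      // placeholder for procs2grid2d()

  for (int me = 0; me < nprocs; me++) {
    int me_y = me % npey_fft;
    int me_z = me / npey_fft;
    int nylo_fft = me_y*ny_pppm/npey_fft;
    int nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
    int nzlo_fft = me_z*nz_pppm/npez_fft;
    int nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
    int nfft = nx_pppm*(nyhi_fft-nylo_fft+1)*(nzhi_fft-nzlo_fft+1);
    printf("proc %d: x 0..%d  y %d..%d  z %d..%d  (%d FFT points)\n",
           me,nx_pppm-1,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,nfft);
  }
  return 0;
}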
nbuf *= 3; + + // print stats + + int ngrid_max,nfft_both_max,nbuf_max; + MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world); + MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world); + MPI_Allreduce(&nbuf,&nbuf_max,1,MPI_INT,MPI_MAX,world); + + if (me == 0) { + if (screen) fprintf(screen," brick FFT buffer size/proc = %d %d %d\n", + ngrid_max,nfft_both_max,nbuf_max); + if (logfile) fprintf(logfile," brick FFT buffer size/proc = %d %d %d\n", + ngrid_max,nfft_both_max,nbuf_max); + } + + // allocate K-space dependent memory + + allocate(); + + // pre-compute Green's function denomiator expansion + // pre-compute 1d charge distribution coefficients + + compute_gf_denom(); + compute_rho_coeff(); +} + +/* ---------------------------------------------------------------------- + adjust PPPM coeffs, called initially and whenever volume has changed +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::setup() +{ + int i,j,k,l,m,n; + double *prd; + + // volume-dependent factors + // adjust z dimension for 2d slab PPPM + // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + delxinv = nx_pppm/xprd; + delyinv = ny_pppm/yprd; + delzinv = nz_pppm/zprd_slab; + + delvolinv = delxinv*delyinv*delzinv; + + double unitkx = (2.0*PI/xprd); + double unitky = (2.0*PI/yprd); + double unitkz = (2.0*PI/zprd_slab); + + // fkx,fky,fkz for my FFT grid pts + + double per; + + for (i = nxlo_fft; i <= nxhi_fft; i++) { + per = i - nx_pppm*(2*i/nx_pppm); + fkx[i] = unitkx*per; + } + + for (i = nylo_fft; i <= nyhi_fft; i++) { + per = i - ny_pppm*(2*i/ny_pppm); + fky[i] = unitky*per; + } + + for (i = nzlo_fft; i <= nzhi_fft; i++) { + per = i - nz_pppm*(2*i/nz_pppm); + fkz[i] = unitkz*per; + } + + // virial coefficients + + double sqk,vterm; + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) { + for (j = nylo_fft; j <= nyhi_fft; j++) { + for (i = nxlo_fft; i <= nxhi_fft; i++) { + sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; + if (sqk == 0.0) { + vg[n][0] = 0.0; + vg[n][1] = 0.0; + vg[n][2] = 0.0; + vg[n][3] = 0.0; + vg[n][4] = 0.0; + vg[n][5] = 0.0; + } else { + vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); + vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; + vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; + vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; + vg[n][3] = vterm*fkx[i]*fky[j]; + vg[n][4] = vterm*fkx[i]*fkz[k]; + vg[n][5] = vterm*fky[j]*fkz[k]; + } + n++; + } + } + } + + // modified (Hockney-Eastwood) Coulomb Green's function + + int nx,ny,nz,kper,lper,mper; + double snx,sny,snz,snx2,sny2,snz2; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,dot1,dot2; + double numerator,denominator; + + int nbx = static_cast ((g_ewald*xprd/(PI*nx_pppm)) * + pow(-log(EPS_HOC),0.25)); + int nby = static_cast ((g_ewald*yprd/(PI*ny_pppm)) * + pow(-log(EPS_HOC),0.25)); + int nbz = static_cast ((g_ewald*zprd_slab/(PI*nz_pppm)) * + pow(-log(EPS_HOC),0.25)); + + double form = 1.0; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + snz = sin(0.5*unitkz*mper*zprd_slab/nz_pppm); + snz2 = snz*snz; + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + sny = sin(0.5*unitky*lper*yprd/ny_pppm); + sny2 = sny*sny; + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - 
nx_pppm*(2*k/nx_pppm); + snx = sin(0.5*unitkx*kper*xprd/nx_pppm); + snx2 = snx*snx; + + sqk = pow(unitkx*kper,2.0) + pow(unitky*lper,2.0) + + pow(unitkz*mper,2.0); + + if (sqk != 0.0) { + numerator = form*12.5663706/sqk; + denominator = gf_denom(snx2,sny2,snz2); + sum1 = 0.0; + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-.25*pow(qx/g_ewald,2.0)); + wx = 1.0; + argx = 0.5*qx*xprd/nx_pppm; + if (argx != 0.0) wx = pow(sin(argx)/argx,order); + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-.25*pow(qy/g_ewald,2.0)); + wy = 1.0; + argy = 0.5*qy*yprd/ny_pppm; + if (argy != 0.0) wy = pow(sin(argy)/argy,order); + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-.25*pow(qz/g_ewald,2.0)); + wz = 1.0; + argz = 0.5*qz*zprd_slab/nz_pppm; + if (argz != 0.0) wz = pow(sin(argz)/argz,order); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * pow(wx*wy*wz,2.0); + } + } + } + greensfn[n++] = numerator*sum1/denominator; + } else greensfn[n++] = 0.0; + } + } + } +} + +/* ---------------------------------------------------------------------- + allocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::allocate() +{ + memory->create(density_fft,nfft_both,"pppm:density_fft"); + memory->create(greensfn,nfft_both,"pppm:greensfn"); + memory->create(work1,2*nfft_both,"pppm:work1"); + memory->create(work2,2*nfft_both,"pppm:work2"); + memory->create(vg,nfft_both,6,"pppm:vg"); + + memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx"); + memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky"); + memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz"); + + memory->create(buf1,nbuf,"pppm:buf1"); + memory->create(buf2,nbuf,"pppm:buf2"); + + // summation coeffs + + gf_b = new double[order]; + memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d"); + memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff"); + + // create 2 FFTs and a Remap + // 1st FFT keeps data in FFT decompostion + // 2nd FFT returns data in 3d brick decomposition + // remap takes data from 3d brick to FFT decomposition + + int tmp; + + fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 0,0,&tmp); + + fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + 0,0,&tmp); + + remap = new Remap(lmp,world, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 1,0,0,2); +} + +/* ---------------------------------------------------------------------- + deallocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::deallocate() +{ + destroy_3d_offset(density_brick,nzlo_out,nylo_out); + destroy_3d_offset(vd_brick,nzlo_out,nylo_out); + + memory->destroy(density_fft); + memory->destroy(greensfn); + memory->destroy(work1); + memory->destroy(work2); + memory->destroy(vg); + + memory->destroy1d_offset(fkx,nxlo_fft); + memory->destroy1d_offset(fky,nylo_fft); + memory->destroy1d_offset(fkz,nzlo_fft); + + memory->destroy(buf1); + memory->destroy(buf2); + + delete [] gf_b; + 
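Both setup() and the Green's-function loop map an FFT-grid index to a signed wavevector component with per = i - n*(2*i/n), which folds indices above n/2 onto negative frequencies before scaling by 2*pi/L. A standalone sketch with illustrative n and L:

// Standalone sketch of the index-to-wavevector folding used for fkx,fky,fkz
// and for kper,lper,mper in the Green's-function loop.
#include <cstdio>
#include <cmath>

int main()
{
  int n = 8;              // grid points along one dimension
  double L = 10.0;        // box length along that dimension
  double PI = 4.0*atan(1.0);
  double unitk = 2.0*PI/L;

  for (int i = 0; i < n; i++) {
    int per = i - n*(2*i/n);          // 0,1,2,3,-4,-3,-2,-1 for n = 8
    printf("i = %d  per = %2d  fk = %g\n",i,per,unitk*per);
  }
  return 0;
}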
memory->destroy2d_offset(rho1d,-order/2); + memory->destroy2d_offset(rho_coeff,(1-order)/2); + + delete fft1; + delete fft2; + delete remap; +} + +/* ---------------------------------------------------------------------- + set size of FFT grid (nx,ny,nz_pppm) and g_ewald +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::set_grid() +{ + // see JCP 109, pg 7698 for derivation of coefficients + // higher order coefficients may be computed if needed + + double **acons; + memory->create(acons,8,7,"pppm:acons"); + + acons[1][0] = 2.0 / 3.0; + acons[2][0] = 1.0 / 50.0; + acons[2][1] = 5.0 / 294.0; + acons[3][0] = 1.0 / 588.0; + acons[3][1] = 7.0 / 1440.0; + acons[3][2] = 21.0 / 3872.0; + acons[4][0] = 1.0 / 4320.0; + acons[4][1] = 3.0 / 1936.0; + acons[4][2] = 7601.0 / 2271360.0; + acons[4][3] = 143.0 / 28800.0; + acons[5][0] = 1.0 / 23232.0; + acons[5][1] = 7601.0 / 13628160.0; + acons[5][2] = 143.0 / 69120.0; + acons[5][3] = 517231.0 / 106536960.0; + acons[5][4] = 106640677.0 / 11737571328.0; + acons[6][0] = 691.0 / 68140800.0; + acons[6][1] = 13.0 / 57600.0; + acons[6][2] = 47021.0 / 35512320.0; + acons[6][3] = 9694607.0 / 2095994880.0; + acons[6][4] = 733191589.0 / 59609088000.0; + acons[6][5] = 326190917.0 / 11700633600.0; + acons[7][0] = 1.0 / 345600.0; + acons[7][1] = 3617.0 / 35512320.0; + acons[7][2] = 745739.0 / 838397952.0; + acons[7][3] = 56399353.0 / 12773376000.0; + acons[7][4] = 25091609.0 / 1560084480.0; + acons[7][5] = 1755948832039.0 / 36229939200000.0; + acons[7][6] = 4887769399.0 / 37838389248.0; + + double q2 = qsqsum / force->dielectric; + bigint natoms = atom->natoms; + + // use xprd,yprd,zprd even if triclinic so grid size is the same + // adjust z dimension for 2d slab PPPM + // 3d PPPM just uses zprd since slab_volfactor = 1.0 + + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double zprd_slab = zprd*slab_volfactor; + + // make initial g_ewald estimate + // based on desired error and real space cutoff + // fluid-occupied volume used to estimate real-space error + // zprd used rather than zprd_slab + + double hx,hy,hz; + + if (!gewaldflag) + g_ewald = sqrt(-log(precision*sqrt(natoms*cutoff*xprd*yprd*zprd) / + (2.0*q2))) / cutoff; + + // set optimal nx_pppm,ny_pppm,nz_pppm based on order and precision + // nz_pppm uses extended zprd_slab instead of zprd + // h = 1/g_ewald is upper bound on h such that h*g_ewald <= 1 + // reduce it until precision target is met + + if (!gridflag) { + double err; + hx = hy = hz = 1/g_ewald; + + nx_pppm = static_cast (xprd/hx + 1); + ny_pppm = static_cast (yprd/hy + 1); + nz_pppm = static_cast (zprd_slab/hz + 1); + + err = rms(hx,xprd,natoms,q2,acons); + while (err > precision) { + err = rms(hx,xprd,natoms,q2,acons); + nx_pppm++; + hx = xprd/nx_pppm; + } + + err = rms(hy,yprd,natoms,q2,acons); + while (err > precision) { + err = rms(hy,yprd,natoms,q2,acons); + ny_pppm++; + hy = yprd/ny_pppm; + } + + err = rms(hz,zprd_slab,natoms,q2,acons); + while (err > precision) { + err = rms(hz,zprd_slab,natoms,q2,acons); + nz_pppm++; + hz = zprd_slab/nz_pppm; + } + } + + // boost grid size until it is factorable + + while (!factorable(nx_pppm)) nx_pppm++; + while (!factorable(ny_pppm)) ny_pppm++; + while (!factorable(nz_pppm)) nz_pppm++; + + // adjust g_ewald for new grid size + + hx = xprd/nx_pppm; + hy = yprd/ny_pppm; + hz = zprd_slab/nz_pppm; + + if (!gewaldflag) { + double gew1,gew2,dgew,f,fmid,hmin,rtb; + int ncount; + + gew1 = 0.0; + g_ewald = gew1; + 
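Once the RMS-error estimate has produced candidate grid counts, each dimension is bumped upward until its prime factors are all in {2,3,5}, since those are the only radices allowed by the factors list. The sketch below mirrors factorable() and that adjustment loop; the starting value is illustrative.

// Standalone sketch of "boost grid size until it is factorable".
#include <cstdio>

static int factorable(int n)
{
  static const int factors[] = {2,3,5};
  while (n > 1) {
    int i;
    for (i = 0; i < 3; i++)
      if (n % factors[i] == 0) { n /= factors[i]; break; }
    if (i == 3) return 0;      // leftover prime factor other than 2,3,5
  }
  return 1;
}

int main()
{
  int nx_pppm = 98;                            // e.g. from the RMS-error estimate
  while (!factorable(nx_pppm)) nx_pppm++;
  printf("adjusted grid size: %d\n",nx_pppm);  // 98 = 2*7*7 -> 100 = 2*2*5*5
  return 0;
}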
f = diffpr(hx,hy,hz,q2,acons); + + hmin = MIN(hx,MIN(hy,hz)); + gew2 = 10/hmin; + g_ewald = gew2; + fmid = diffpr(hx,hy,hz,q2,acons); + + if (f*fmid >= 0.0) error->all("Cannot compute PPPM G"); + rtb = f < 0.0 ? (dgew=gew2-gew1,gew1) : (dgew=gew1-gew2,gew2); + ncount = 0; + while (fabs(dgew) > SMALL && fmid != 0.0) { + dgew *= 0.5; + g_ewald = rtb + dgew; + fmid = diffpr(hx,hy,hz,q2,acons); + if (fmid <= 0.0) rtb = g_ewald; + ncount++; + if (ncount > LARGE) error->all("Cannot compute PPPM G"); + } + } + + // final RMS precision + + double lprx = rms(hx,xprd,natoms,q2,acons); + double lpry = rms(hy,yprd,natoms,q2,acons); + double lprz = rms(hz,zprd_slab,natoms,q2,acons); + double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); + double spr = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) / + sqrt(natoms*cutoff*xprd*yprd*zprd_slab); + + // free local memory + + memory->destroy(acons); + + // print info + + if (me == 0) { + if (screen) { + fprintf(screen," G vector = %g\n",g_ewald); + fprintf(screen," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + fprintf(screen," stencil order = %d\n",order); + fprintf(screen," RMS precision = %g\n",MAX(lpr,spr)); + } + if (logfile) { + fprintf(logfile," G vector = %g\n",g_ewald); + fprintf(logfile," grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm); + fprintf(logfile," stencil order = %d\n",order); + fprintf(logfile," RMS precision = %g\n",MAX(lpr,spr)); + } + } +} + +/* ---------------------------------------------------------------------- + check if all factors of n are in list of factors + return 1 if yes, 0 if no +------------------------------------------------------------------------- */ + +template +int PPPMGPUT::factorable(int n) +{ + int i; + + while (n > 1) { + for (i = 0; i < nfactors; i++) { + if (n % factors[i] == 0) { + n /= factors[i]; + break; + } + } + if (i == nfactors) return 0; + } + + return 1; +} + +/* ---------------------------------------------------------------------- + compute RMS precision for a dimension +------------------------------------------------------------------------- */ + +template +double PPPMGPUT::rms(double h, double prd, bigint natoms, + double q2, double **acons) +{ + double sum = 0.0; + for (int m = 0; m < order; m++) + sum += acons[order][m] * pow(h*g_ewald,2.0*m); + double value = q2 * pow(h*g_ewald,order) * + sqrt(g_ewald*prd*sqrt(2.0*PI)*sum/natoms) / (prd*prd); + return value; +} + +/* ---------------------------------------------------------------------- + compute difference in real-space and kspace RMS precision +------------------------------------------------------------------------- */ + +template +double PPPMGPUT::diffpr(double hx, double hy, double hz, double q2, + double **acons) +{ + double lprx,lpry,lprz,kspace_prec,real_prec; + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + bigint natoms = atom->natoms; + + lprx = rms(hx,xprd,natoms,q2,acons); + lpry = rms(hy,yprd,natoms,q2,acons); + lprz = rms(hz,zprd*slab_volfactor,natoms,q2,acons); + kspace_prec = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); + real_prec = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) / + sqrt(natoms*cutoff*xprd*yprd*zprd); + double value = kspace_prec - real_prec; + return value; +} + +/* ---------------------------------------------------------------------- + denominator for Hockney-Eastwood Green's function + of x,y,z = sin(kx*deltax/2), etc + + inf n-1 + S(n,k) = Sum W(k+pi*j)**2 = Sum b(l)*(z*z)**l + j=-inf l=0 + + = -(z*z)**n /(2n-1)! 
* (d/dx)**(2n-1) cot(x) at z = sin(x) + gf_b = denominator expansion coeffs +------------------------------------------------------------------------- */ + +template +double PPPMGPUT::gf_denom(double x, double y, double z) +{ + double sx,sy,sz; + sz = sy = sx = 0.0; + for (int l = order-1; l >= 0; l--) { + sx = gf_b[l] + sx*x; + sy = gf_b[l] + sy*y; + sz = gf_b[l] + sz*z; + } + double s = sx*sy*sz; + return s*s; +} + +/* ---------------------------------------------------------------------- + pre-compute Green's function denominator expansion coeffs, Gamma(2n) +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::compute_gf_denom() +{ + int k,l,m; + + for (l = 1; l < order; l++) gf_b[l] = 0.0; + gf_b[0] = 1.0; + + for (m = 1; m < order; m++) { + for (l = m; l > 0; l--) + gf_b[l] = 4.0 * (gf_b[l]*(l-m)*(l-m-0.5)-gf_b[l-1]*(l-m-1)*(l-m-1)); + gf_b[0] = 4.0 * (gf_b[0]*(l-m)*(l-m-0.5)); + } + + int ifact = 1; + for (k = 1; k < 2*order; k++) ifact *= k; + double gaminv = 1.0/ifact; + for (l = 0; l < order; l++) gf_b[l] *= gaminv; +} + +/* ---------------------------------------------------------------------- + ghost-swap to accumulate full density in brick decomposition + remap density from 3d brick decomposition to FFT decomposition +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::brick2fft() +{ + int i,n,ix,iy,iz; + MPI_Request request; + MPI_Status status; + + // pack my ghosts for +x processor + // pass data to self or +x processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = nxhi_in+1; ix <= nxhi_out; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[0][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][1],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = nxlo_in; ix < nxlo_in+nxlo_ghost; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // pack my ghosts for -x processor + // pass data to self or -x processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = nxlo_out; ix < nxlo_in; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[0][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = nxhi_in-nxhi_ghost+1; ix <= nxhi_in; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // pack my ghosts for +y processor + // pass data to self or +y processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nyhi_in+1; iy <= nyhi_out; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[1][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][1],0,world); + 
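gf_denom() evaluates the denominator of the Hockney-Eastwood Green's function as the squared product of three order-term polynomials in sin^2 of the half grid wavenumbers, using Horner's rule. A standalone sketch of that evaluation; the gf_b values are placeholders rather than the coefficients produced by compute_gf_denom().

// Standalone sketch of the Horner evaluation in gf_denom(): each factor is
// sum_l gf_b[l]*(sin^2)^l, and the denominator is the square of the product.
#include <cstdio>

int main()
{
  const int order = 5;
  double gf_b[order] = {1.0, 0.5, 0.25, 0.125, 0.0625};   // placeholder coeffs

  double x = 0.3, y = 0.2, z = 0.1;       // sin^2 of half grid wavenumbers
  double sx = 0.0, sy = 0.0, sz = 0.0;
  for (int l = order-1; l >= 0; l--) {    // Horner's rule, highest power first
    sx = gf_b[l] + sx*x;
    sy = gf_b[l] + sy*y;
    sz = gf_b[l] + sz*z;
  }
  double denom = (sx*sy*sz)*(sx*sy*sz);
  printf("gf_denom(%g,%g,%g) = %g\n",x,y,z,denom);
  return 0;
}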
MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // pack my ghosts for -y processor + // pass data to self or -y processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy < nylo_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[1][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // pack my ghosts for +z processor + // pass data to self or +z processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzhi_in+1; iz <= nzhi_out; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[2][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][1],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // pack my ghosts for -z processor + // pass data to self or -z processor + // unpack and sum recv data into my real cells + + n = 0; + for (iz = nzlo_out; iz < nzlo_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + buf1[n++] = density_brick[iz][iy][ix]; + + if (comm->procneigh[2][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_brick[iz][iy][ix] += buf2[n++]; + + // remap from 3d brick decomposition to FFT decomposition + // copy grabs inner portion of density from 3d brick + // remap could be done as pre-stage of FFT, + // but this works optimally on only double values, not complex values + + n = 0; + for (iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_fft[n++] = density_brick[iz][iy][ix]; + + remap->perform(density_fft,density_fft,work1); +} + +/* ---------------------------------------------------------------------- + ghost-swap to fill ghost cells of my brick with field values +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::fillbrick() +{ + int i,n,ix,iy,iz; + MPI_Request request; + MPI_Status status; + + // pack my real cells for +z processor + // pass data to self or +z processor + // unpack and sum recv data into my ghost cells + + n = 0; + int x_lo = nxlo_in * 4; + int x_hi = nxhi_in * 4 + 1; + for (iz = nzhi_in-nzhi_ghost+1; iz <= nzhi_in; iz++) + 
for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[2][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][1],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz < nzlo_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } + + // pack my real cells for -z processor + // pass data to self or -z processor + // unpack and sum recv data into my ghost cells + + n = 0; + for (iz = nzlo_in; iz < nzlo_in+nzlo_ghost; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[2][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzhi_in+1; iz <= nzhi_out; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } + + // pack my real cells for +y processor + // pass data to self or +y processor + // unpack and sum recv data into my ghost cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nyhi_in-nyhi_ghost+1; iy <= nyhi_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[1][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][1],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy < nylo_in; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } + + // pack my real cells for -y processor + // pass data to self or -y processor + // unpack and sum recv data into my ghost cells + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_in; iy < nylo_in+nylo_ghost; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[1][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nyhi_in+1; iy <= nyhi_out; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } + + // pack my real cells for +x processor + // pass data to self or +x processor + // unpack and sum recv data into my ghost cells + 
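fillbrick() walks vd_brick with a stride of 4 because each grid point stores the three field components plus one padding slot, so x indices are premultiplied by 4 and only the first three slots per point are communicated. A standalone sketch of that interleaved layout and pack loop, with illustrative sizes:

// Standalone sketch of the interleaved (vdx, vdy, vdz, pad) layout and the
// stride-4 pack loop used by fillbrick().
#include <cstdio>

int main()
{
  const int nx = 3;                 // grid points along x for one (z,y) row
  double row[4*nx];                 // interleaved (vdx, vdy, vdz, pad) per point

  for (int i = 0; i < nx; i++) {    // fill with recognizable values
    row[4*i+0] = 10.0*i + 1;        // vdx
    row[4*i+1] = 10.0*i + 2;        // vdy
    row[4*i+2] = 10.0*i + 3;        // vdz
    row[4*i+3] = 0.0;               // padding, never communicated
  }

  double buf[3*nx];                 // communication buffer holds 3 values/point
  int n = 0;
  int x_lo = 0*4, x_hi = (nx-1)*4 + 1;
  for (int ix = x_lo; ix < x_hi; ix += 4) {
    buf[n++] = row[ix];
    buf[n++] = row[ix+1];
    buf[n++] = row[ix+2];
  }
  printf("packed %d values from %d grid points\n",n,nx);
  return 0;
}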
+ n = 0; + x_lo = (nxhi_in-nxhi_ghost+1) * 4; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[0][1] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][0],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][1],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + x_lo = nxlo_out * 4; + x_hi = nxlo_in * 4; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } + + // pack my real cells for -x processor + // pass data to self or -x processor + // unpack and sum recv data into my ghost cells + + n = 0; + x_lo = x_hi; + x_hi = (nxlo_in+nxlo_ghost) * 4; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + buf1[n++] = vd_brick[iz][iy][ix]; + buf1[n++] = vd_brick[iz][iy][ix+1]; + buf1[n++] = vd_brick[iz][iy][ix+2]; + } + + if (comm->procneigh[0][0] == me) + for (i = 0; i < n; i++) buf2[i] = buf1[i]; + else { + MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][1],0,world,&request); + MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][0],0,world); + MPI_Wait(&request,&status); + } + + n = 0; + x_lo = (nxhi_in + 1) * 4; + x_hi = nxhi_out * 4 + 1; + for (iz = nzlo_out; iz <= nzhi_out; iz++) + for (iy = nylo_out; iy <= nyhi_out; iy++) + for (ix = x_lo; ix < x_hi; ix+=4) { + vd_brick[iz][iy][ix] = buf2[n++]; + vd_brick[iz][iy][ix+1] = buf2[n++]; + vd_brick[iz][iy][ix+2] = buf2[n++]; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::poisson(int eflag, int vflag) +{ + int i,j,k,n; + double eng; + + // transform charge density (r -> k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] = density_fft[i]; + work1[n++] = 0.0; + } + + fft1->compute(work1,work1,1); + + // if requested, compute energy and virial contribution + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + if (eflag || vflag) { + if (vflag) { + n = 0; + for (i = 0; i < nfft; i++) { + eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; + energy += eng; + n += 2; + } + } else { + n = 0; + for (i = 0; i < nfft; i++) { + energy += + s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + n += 2; + } + } + } + + // scale by 1/total-grid-pts to get rho(k) + // multiply by Green's function to get V(k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] *= scaleinv * greensfn[i]; + work1[n++] *= scaleinv * greensfn[i]; + } + + // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // FFT leaves data in 3d brick decomposition + // copy it into inner portion of vdx,vdy,vdz arrays + + // x direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkx[i]*work1[n+1]; + work2[n+1] = -fkx[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + int x_hi = nxhi_in * 4 + 3; + for (k 
= nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in * 4; i < x_hi; i+=4) { + vd_brick[k][j][i] = work2[n]; + n += 2; + } + + // y direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fky[j]*work1[n+1]; + work2[n+1] = -fky[j]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in * 4 + 1; i < x_hi; i+=4) { + vd_brick[k][j][i] = work2[n]; + n += 2; + } + + // z direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkz[k]*work1[n+1]; + work2[n+1] = -fkz[k]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in * 4 + 2; i < x_hi; i+=4) { + vd_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + map nprocs to NX by NY grid as PX by PY procs - return optimal px,py +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::procs2grid2d(int nprocs, int nx, int ny, int *px, int *py) +{ + // loop thru all possible factorizations of nprocs + // surf = surface area of largest proc sub-domain + // innermost if test minimizes surface area and surface/volume ratio + + int bestsurf = 2 * (nx + ny); + int bestboxx = 0; + int bestboxy = 0; + + int boxx,boxy,surf,ipx,ipy; + + ipx = 1; + while (ipx <= nprocs) { + if (nprocs % ipx == 0) { + ipy = nprocs/ipx; + boxx = nx/ipx; + if (nx % ipx) boxx++; + boxy = ny/ipy; + if (ny % ipy) boxy++; + surf = boxx + boxy; + if (surf < bestsurf || + (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) { + bestsurf = surf; + bestboxx = boxx; + bestboxy = boxy; + *px = ipx; + *py = ipy; + } + } + ipx++; + } +} + +/* ---------------------------------------------------------------------- + charge assignment into rho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::compute_rho1d(double dx, double dy, double dz) +{ + int k,l; + + for (k = (1-order)/2; k <= order/2; k++) { + rho1d[0][k] = 0.0; + rho1d[1][k] = 0.0; + rho1d[2][k] = 0.0; + for (l = order-1; l >= 0; l--) { + rho1d[0][k] = rho_coeff[l][k] + rho1d[0][k]*dx; + rho1d[1][k] = rho_coeff[l][k] + rho1d[1][k]*dy; + rho1d[2][k] = rho_coeff[l][k] + rho1d[2][k]*dz; + } + } +} + +/* ---------------------------------------------------------------------- + generate coeffients for the weight function of order n + + (n-1) + Wn(x) = Sum wn(k,x) , Sum is over every other integer + k=-(n-1) + For k=-(n-1),-(n-1)+2, ....., (n-1)-2,n-1 + k is odd integers if n is even and even integers if n is odd + --- + | n-1 + | Sum a(l,j)*(x-k/2)**l if abs(x-k/2) < 1/2 + wn(k,x) = < l=0 + | + | 0 otherwise + --- + a coeffients are packed into the array rho_coeff to eliminate zeros + rho_coeff(l,((k+mod(n+1,2))/2) = a(l,k) +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::compute_rho_coeff() +{ + int j,k,l,m; + double s; + + double **a; + memory->create2d_offset(a,order,-order,order,"pppm:a"); + + for (k = -order; k <= order; k++) + for (l = 0; l < order; l++) + a[l][k] = 0.0; + + 
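+  // a[l][k] is the coefficient of (x-k/2)**l in the piecewise polynomial
+  // defined in the header comment above; starting from a[0][0] = 1, each
+  // pass of the j loop builds the order j+1 coefficients from order j,
+  // and the nonzero entries are then packed into rho_coeff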
a[0][0] = 1.0; + for (j = 1; j < order; j++) { + for (k = -j; k <= j; k += 2) { + s = 0.0; + for (l = 0; l < j; l++) { + a[l+1][k] = (a[l][k+1]-a[l][k-1]) / (l+1); + s += pow(0.5,(double) l+1) * + (a[l][k-1] + pow(-1.0,(double) l) * a[l][k+1]) / (l+1); + } + a[0][k] = s; + } + } + + m = (1-order)/2; + for (k = -(order-1); k < order; k += 2) { + for (l = 0; l < order; l++) + rho_coeff[l][m] = a[l][k]; + m++; + } + + memory->destroy2d_offset(a,-order); +} + +/* ---------------------------------------------------------------------- + Slab-geometry correction term to dampen inter-slab interactions between + periodically repeating slabs. Yields good approximation to 2D Ewald if + adequate empty space is left between repeating slabs (J. Chem. Phys. + 111, 3155). Slabs defined here to be parallel to the xy plane. +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::slabcorr(int eflag) +{ + // compute local contribution to global dipole moment + + double *q = atom->q; + double **x = atom->x; + int nlocal = atom->nlocal; + + double dipole = 0.0; + for (int i = 0; i < nlocal; i++) dipole += q[i]*x[i][2]; + + // sum local contributions to get global dipole moment + + double dipole_all; + MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world); + + // compute corrections + + double e_slabcorr = 2.0*PI*dipole_all*dipole_all/volume; + + if (eflag) energy += qqrd2e*scale * e_slabcorr; + + // add on force corrections + + double ffact = -4.0*PI*dipole_all/volume; + double **f = atom->f; + + for (int i = 0; i < nlocal; i++) f[i][2] += qqrd2e*scale * q[i]*ffact; +} + +/* ---------------------------------------------------------------------- + perform and time the 4 FFTs required for N timesteps +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::timing(int n, double &time3d, double &time1d) +{ + double time1,time2; + + for (int i = 0; i < 2*nfft_both; i++) work1[i] = 0.0; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->compute(work1,work1,1); + fft2->compute(work1,work1,-1); + fft2->compute(work1,work1,-1); + fft2->compute(work1,work1,-1); + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time3d = time2 - time1; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->timing1d(work1,nfft_both,1); + fft2->timing1d(work1,nfft_both,-1); + fft2->timing1d(work1,nfft_both,-1); + fft2->timing1d(work1,nfft_both,-1); + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time1d = time2 - time1; +} + +/* ---------------------------------------------------------------------- + Create array using offsets from pinned memory allocation +------------------------------------------------------------------------- */ + +template +grdtyp ***PPPMGPUT::create_3d_offset(int n1lo, int n1hi, int n2lo, int n2hi, + int n3lo, int n3hi, const char *name, + grdtyp *data, int vec_length) +{ + int i,j; + int n1 = n1hi - n1lo + 1; + int n2 = n2hi - n2lo + 1; + int n3 = n3hi - n3lo + 1; + + grdtyp **plane = (grdtyp **)memory->smalloc(n1*n2*sizeof(grdtyp *),name); + grdtyp ***array = (grdtyp ***)memory->smalloc(n1*sizeof(grdtyp **),name); + + int n = 0; + for (i = 0; i < n1; i++) { + array[i] = &plane[i*n2]; + for (j = 0; j < n2; j++) { + plane[i*n2+j] = &data[n]; + n += n3*vec_length; + } + } + + for (i = 0; i < n1*n2; i++) array[0][i] -= n3lo*vec_length; + for (i = 0; i < n1; i++) array[i] -= n2lo; + return array-n1lo; +} + +/* 
---------------------------------------------------------------------- + templated 3d memory offsets +------------------------------------------------------------------------- */ + +template +void PPPMGPUT::destroy_3d_offset(grdtyp ***array, int n1_offset, + int n2_offset) +{ + if (array == NULL) return; + memory->sfree(&array[n1_offset][n2_offset]); + memory->sfree(array + n1_offset); +} + +template class PPPMGPU; +template class PPPMGPU; diff --git a/src/GPU/pppm_gpu.h b/src/GPU/pppm_gpu.h new file mode 100644 index 0000000000..ba0b3e3e93 --- /dev/null +++ b/src/GPU/pppm_gpu.h @@ -0,0 +1,103 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_PPPM_GPU_H +#define LMP_PPPM_GPU_H + +#include "kspace.h" +#include "lmptype.h" + +namespace LAMMPS_NS { + +template +class PPPMGPU : public KSpace { + public: + PPPMGPU(class LAMMPS *, int, char **); + virtual ~PPPMGPU(); + virtual void init() = 0; + void base_init(); + void setup(); + virtual void compute(int, int) = 0; + void timing(int, double &, double &); + virtual double memory_usage() = 0; + + protected: + int me,nprocs; + double PI; + double precision; + int nfactors; + int *factors; + double qsum,qsqsum; + double qqrd2e; + double cutoff; + double volume; + double delxinv,delyinv,delzinv,delvolinv; + double shift,shiftone; + + int nxlo_in,nylo_in,nzlo_in,nxhi_in,nyhi_in,nzhi_in; + int nxlo_out,nylo_out,nzlo_out,nxhi_out,nyhi_out,nzhi_out; + int nxlo_ghost,nxhi_ghost,nylo_ghost,nyhi_ghost,nzlo_ghost,nzhi_ghost; + int nxlo_fft,nylo_fft,nzlo_fft,nxhi_fft,nyhi_fft,nzhi_fft; + int nlower,nupper; + int ngrid,nfft,nbuf,nfft_both; + + grdtyp ***density_brick; + grdtyp ***vd_brick; + double *greensfn; + double **vg; + double *fkx,*fky,*fkz; + double *density_fft; + double *work1,*work2; + double *buf1,*buf2; + + double *gf_b; + double **rho1d,**rho_coeff; + + class FFT3d *fft1,*fft2; + class Remap *remap; + + int nmax; + + int triclinic; // domain settings, orthog or triclinic + double *boxlo; + // TIP4P settings + int typeH,typeO; // atom types of TIP4P water H and O atoms + double qdist; // distance from O site to negative charge + double alpha; // geometric factor + + void set_grid(); + void allocate(); + void deallocate(); + int factorable(int); + double rms(double, double, bigint, double, double **); + double diffpr(double, double, double, double, double **); + void compute_gf_denom(); + double gf_denom(double, double, double); + void brick2fft(); + void fillbrick(); + void poisson(int, int); + void procs2grid2d(int,int,int,int *, int*); + void compute_rho1d(double, double, double); + void compute_rho_coeff(); + void slabcorr(int); + + double poisson_time; + + grdtyp ***create_3d_offset(int, int, int, int, int, int, const char *, + grdtyp *, int); + void destroy_3d_offset(grdtyp ***, int, int); +}; + +} + +#endif diff --git a/src/GPU/pppm_gpu_double.cpp b/src/GPU/pppm_gpu_double.cpp new file mode 100644 index 0000000000..4ddc7e5ad5 --- /dev/null +++ 
b/src/GPU/pppm_gpu_double.cpp @@ -0,0 +1,217 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "string.h" +#include "stdio.h" +#include "stdlib.h" +#include "math.h" +#include "pppm_gpu_double.h" +#include "lmptype.h" +#include "atom.h" +#include "comm.h" +#include "neighbor.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "domain.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "memory.h" +#include "error.h" +#include "gpu_extra.h" + +#define grdtyp double + +// External functions from cuda library for atom decomposition + +grdtyp* pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen, + const int order, const int nxlo_out, + const int nylo_out, const int nzlo_out, + const int nxhi_out, const int nyhi_out, + const int nzhi_out, double **rho_coeff, + grdtyp **_vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, + const int nz_pppm, int &success); +void pppm_gpu_clear_d(const double poisson_time); +int pppm_gpu_spread_d(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, const double delxinv, + const double delyinv, const double delzinv); +void pppm_gpu_interp_d(const grdtyp qqrd2e_scale); +double pppm_gpu_bytes_d(); + +using namespace LAMMPS_NS; + +#define MAXORDER 7 +#define OFFSET 16384 +#define SMALL 0.00001 +#define LARGE 10000.0 +#define EPS_HOC 1.0e-7 + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? 
(a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PPPMGPUDouble::PPPMGPUDouble(LAMMPS *lmp, int narg, char **arg) : + PPPMGPU(lmp, narg, arg) +{ +} + +/* ---------------------------------------------------------------------- + free all memory +------------------------------------------------------------------------- */ + +PPPMGPUDouble::~PPPMGPUDouble() +{ + pppm_gpu_clear_d(poisson_time); +} + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + +void PPPMGPUDouble::init() +{ + base_init(); + + if (order>8) + error->all("Cannot use order greater than 8 with pppm/gpu."); + pppm_gpu_clear_d(poisson_time); + + int success; + grdtyp *data, *h_brick; + h_brick = pppm_gpu_init_d(atom->nlocal, atom->nlocal+atom->nghost, screen, + order, nxlo_out, nylo_out, nzlo_out, nxhi_out, + nyhi_out, nzhi_out, rho_coeff, &data, + slab_volfactor, nx_pppm, ny_pppm, nz_pppm, + success); + + GPU_EXTRA::check_flag(success,error,world); + + density_brick = + create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_brick",h_brick,1); + vd_brick = + create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vd_brick",data,4); + + poisson_time=0; +} + +/* ---------------------------------------------------------------------- + compute the PPPMGPU long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPMGPUDouble::compute(int eflag, int vflag) +{ + bool success = true; + int flag=pppm_gpu_spread_d(neighbor->ago, atom->nlocal, atom->nlocal + + atom->nghost, atom->x, atom->type, success, + atom->q, domain->boxlo, delxinv, delyinv, + delzinv); + if (!success) + error->one("Out of memory on GPGPU"); + if (flag != 0) + error->one("Out of range atoms - cannot compute PPPM"); + + int i; + + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + energy = 0.0; + if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0; + + double t3=MPI_Wtime(); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + brick2fft(); + + // compute potential gradient on my FFT grid and + // portion of e_long on this proc's FFT grid + // return gradients (electric fields) in 3d brick decomposition + + poisson(eflag,vflag); + + // all procs communicate E-field values to fill ghost cells + // surrounding their 3d bricks + + fillbrick(); + + poisson_time+=MPI_Wtime()-t3; + + // calculate the force on my particles + + grdtyp qqrd2e_scale=qqrd2e*scale; + pppm_gpu_interp_d(qqrd2e_scale); + + // sum energy across procs and add in volume-dependent term + + if (eflag) { + double energy_all; + MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy = energy_all; + + energy *= 0.5*volume; + energy -= g_ewald*qsqsum/1.772453851 + + 0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume); + energy *= qqrd2e*scale; + } + + // sum virial across procs + + if (vflag) { + double virial_all[6]; + MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*scale*volume*virial_all[i]; + } + + // 2d slab correction + + if (slabflag) slabcorr(eflag); + + // convert atoms back from lamda to box coords + + if 
(triclinic) domain->lamda2x(atom->nlocal); +} + +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ + +double PPPMGPUDouble::memory_usage() +{ + double bytes = nmax*3 * sizeof(double); + int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + bytes += 4 * nbrick * sizeof(grdtyp); + bytes += 6 * nfft_both * sizeof(double); + bytes += nfft_both*6 * sizeof(double); + bytes += 2 * nbuf * sizeof(double); + return bytes + pppm_gpu_bytes_d(); +} diff --git a/src/GPU/pppm_gpu_double.h b/src/GPU/pppm_gpu_double.h new file mode 100644 index 0000000000..a64612a7c0 --- /dev/null +++ b/src/GPU/pppm_gpu_double.h @@ -0,0 +1,42 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(pppm/gpu/double,PPPMGPUDouble) + +#else + +#ifndef LMP_PPPM_GPU_DOUBLE_H +#define LMP_PPPM_GPU_DOUBLE_H + +#include "pppm_gpu.h" +#include "lmptype.h" + +namespace LAMMPS_NS { + +class PPPMGPUDouble : public PPPMGPU { + public: + PPPMGPUDouble(class LAMMPS *, int, char **); + ~PPPMGPUDouble(); + void init(); + void compute(int, int); + double memory_usage(); + + protected: +}; + +} + +#endif +#endif diff --git a/src/GPU/pppm_gpu_single.cpp b/src/GPU/pppm_gpu_single.cpp new file mode 100644 index 0000000000..cee6b438bf --- /dev/null +++ b/src/GPU/pppm_gpu_single.cpp @@ -0,0 +1,216 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Mike Brown (ORNL) +------------------------------------------------------------------------- */ + +#include "mpi.h" +#include "string.h" +#include "stdio.h" +#include "stdlib.h" +#include "math.h" +#include "pppm_gpu_single.h" +#include "lmptype.h" +#include "atom.h" +#include "comm.h" +#include "neighbor.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "domain.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "memory.h" +#include "error.h" +#include "gpu_extra.h" + +#define grdtyp float + +// External functions from cuda library for atom decomposition + +grdtyp* pppm_gpu_init_f(const int nlocal, const int nall, FILE *screen, + const int order, const int nxlo_out, + const int nylo_out, const int nzlo_out, + const int nxhi_out, const int nyhi_out, + const int nzhi_out, double **rho_coeff, + grdtyp **_vd_brick, const double slab_volfactor, + const int nx_pppm, const int ny_pppm, + const int nz_pppm, int &success); +void pppm_gpu_clear_f(const double poisson_time); +int pppm_gpu_spread_f(const int ago, const int nlocal, const int nall, + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, const double delxinv, + const double delyinv, const double delzinv); +void pppm_gpu_interp_f(const grdtyp qqrd2e_scale); +double pppm_gpu_bytes_f(); + +using namespace LAMMPS_NS; + +#define MAXORDER 7 +#define OFFSET 16384 +#define SMALL 0.00001 +#define LARGE 10000.0 +#define EPS_HOC 1.0e-7 + +#define MIN(a,b) ((a) < (b) ? (a) : (b)) +#define MAX(a,b) ((a) > (b) ? (a) : (b)) + +/* ---------------------------------------------------------------------- */ + +PPPMGPUSingle::PPPMGPUSingle(LAMMPS *lmp, int narg, char **arg) : + PPPMGPU(lmp, narg, arg) +{ +} + +/* ---------------------------------------------------------------------- + free all memory +------------------------------------------------------------------------- */ + +PPPMGPUSingle::~PPPMGPUSingle() +{ + pppm_gpu_clear_f(poisson_time); +} + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + +void PPPMGPUSingle::init() +{ + base_init(); + + if (order>8) + error->all("Cannot use order greater than 8 with pppm/gpu."); + pppm_gpu_clear_f(poisson_time); + + int success; + grdtyp *data, *h_brick; + h_brick = pppm_gpu_init_f(atom->nlocal, atom->nlocal+atom->nghost, screen, + order, nxlo_out, nylo_out, nzlo_out, nxhi_out, + nyhi_out, nzhi_out, rho_coeff, &data, + slab_volfactor,nx_pppm,ny_pppm,nz_pppm,success); + + GPU_EXTRA::check_flag(success,error,world); + + density_brick = + create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_brick",h_brick,1); + vd_brick = + create_3d_offset(nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vd_brick",data,4); + + poisson_time=0; +} + +/* ---------------------------------------------------------------------- + compute the PPPMGPU long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPMGPUSingle::compute(int eflag, int vflag) +{ + bool success = true; + int flag=pppm_gpu_spread_f(neighbor->ago, atom->nlocal, atom->nlocal + + atom->nghost, atom->x, atom->type, success, + atom->q, domain->boxlo, delxinv, delyinv, + delzinv); 
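+  // charge assignment to the density grid is done on the device by
+  // pppm_gpu_spread_f(); success is set false if device memory could not
+  // be allocated, and flag is nonzero if any atoms were out of range for
+  // charge assignment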
+ if (!success) + error->one("Out of memory on GPGPU"); + if (flag != 0) + error->one("Out of range atoms - cannot compute PPPM"); + + int i; + + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + energy = 0.0; + if (vflag) for (i = 0; i < 6; i++) virial[i] = 0.0; + + double t3=MPI_Wtime(); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + brick2fft(); + + // compute potential gradient on my FFT grid and + // portion of e_long on this proc's FFT grid + // return gradients (electric fields) in 3d brick decomposition + + poisson(eflag,vflag); + + // all procs communicate E-field values to fill ghost cells + // surrounding their 3d bricks + + fillbrick(); + + poisson_time+=MPI_Wtime()-t3; + + // calculate the force on my particles + + grdtyp qqrd2e_scale=qqrd2e*scale; + pppm_gpu_interp_f(qqrd2e_scale); + + // sum energy across procs and add in volume-dependent term + + if (eflag) { + double energy_all; + MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy = energy_all; + + energy *= 0.5*volume; + energy -= g_ewald*qsqsum/1.772453851 + + 0.5*PI*qsum*qsum / (g_ewald*g_ewald*volume); + energy *= qqrd2e*scale; + } + + // sum virial across procs + + if (vflag) { + double virial_all[6]; + MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qqrd2e*scale*volume*virial_all[i]; + } + + // 2d slab correction + + if (slabflag) slabcorr(eflag); + + // convert atoms back from lamda to box coords + + if (triclinic) domain->lamda2x(atom->nlocal); +} + +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ + +double PPPMGPUSingle::memory_usage() +{ + double bytes = nmax*3 * sizeof(double); + int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + bytes += 4 * nbrick * sizeof(grdtyp); + bytes += 6 * nfft_both * sizeof(double); + bytes += nfft_both*6 * sizeof(double); + bytes += 2 * nbuf * sizeof(double); + return bytes + pppm_gpu_bytes_f(); +} diff --git a/src/GPU/pppm_gpu_single.h b/src/GPU/pppm_gpu_single.h new file mode 100644 index 0000000000..c17454dca4 --- /dev/null +++ b/src/GPU/pppm_gpu_single.h @@ -0,0 +1,42 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/gpu/single,PPPMGPUSingle)
+
+#else
+
+#ifndef LMP_PPPM_GPU_SINGLE_H
+#define LMP_PPPM_GPU_SINGLE_H
+
+#include "pppm_gpu.h"
+#include "lmptype.h"
+
+namespace LAMMPS_NS {
+
+class PPPMGPUSingle : public PPPMGPU<float> {
+ public:
+  PPPMGPUSingle(class LAMMPS *, int, char **);
+  ~PPPMGPUSingle();
+  void init();
+  void compute(int, int);
+  double memory_usage();
+
+ protected:
+};
+
+}
+
+#endif
+#endif