From 8b76e47d6bb084cbef50332ef1af0caad5d75854 Mon Sep 17 00:00:00 2001 From: Steve Plimpton Date: Wed, 5 Aug 2020 16:44:56 -0600 Subject: [PATCH] support for tiled decompositions in PPPM --- src/KSPACE/gridcomm2.cpp | 1093 ++++++++++++ src/KSPACE/gridcomm2.h | 201 +++ src/KSPACE/pppm2.cpp | 3524 ++++++++++++++++++++++++++++++++++++++ src/KSPACE/pppm2.h | 360 ++++ src/force.cpp | 6 +- src/kspace.h | 5 + 6 files changed, 5186 insertions(+), 3 deletions(-) create mode 100644 src/KSPACE/gridcomm2.cpp create mode 100644 src/KSPACE/gridcomm2.h create mode 100644 src/KSPACE/pppm2.cpp create mode 100644 src/KSPACE/pppm2.h diff --git a/src/KSPACE/gridcomm2.cpp b/src/KSPACE/gridcomm2.cpp new file mode 100644 index 0000000000..ce9a1e7568 --- /dev/null +++ b/src/KSPACE/gridcomm2.cpp @@ -0,0 +1,1093 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include "gridcomm2.h" +#include +#include "comm.h" +#include "kspace.h" +#include "irregular.h" +#include "memory.h" + +using namespace LAMMPS_NS; + +enum{REGULAR,TILED}; + +#define SWAPDELTA 8 + +// NOTE: gridcomm needs to be world for TILED, will it work with MSM? +// NOTE: Tiled implementation here only works for RCB, not general tiled + +/* ---------------------------------------------------------------------- + gcomm = MPI communicator that shares this grid + does not have to be world, see MSM + gn xyz = size of global grid + i xyz lohi = portion of global grid this proc owns, 0 <= index < N + o xyz lohi = owned grid portion + ghost grid cells needed in all directions + if o indices are < 0 or hi indices are >= N, + then grid is treated as periodic in that dimension, + communication is done across the periodic boundaries +------------------------------------------------------------------------- */ + +GridComm2::GridComm2(LAMMPS *lmp, MPI_Comm gcomm, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi) + : Pointers(lmp) +{ + gridcomm = gcomm; + MPI_Comm_rank(gridcomm,&me); + MPI_Comm_size(gridcomm,&nprocs); + + nx = gnx; + ny = gny; + nz = gnz; + + inxlo = ixlo; + inxhi = ixhi; + inylo = iylo; + inyhi = iyhi; + inzlo = izlo; + inzhi = izhi; + + outxlo = oxlo; + outxhi = oxhi; + outylo = oylo; + outyhi = oyhi; + outzlo = ozlo; + outzhi = ozhi; + + // layout == REGULAR or TILED + // for REGULAR, proc xyz lohi = my 6 neighbor procs + + layout = REGULAR; + if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; + + outxlo_max = oxlo; + outxhi_max = oxhi; + outylo_max = oylo; + outyhi_max = oyhi; + outzlo_max = ozlo; + outzhi_max = ozhi; + + if (layout == REGULAR) { + int (*procneigh)[2] = comm->procneigh; + + procxlo = procneigh[0][0]; + procxhi = procneigh[0][1]; + procylo = procneigh[1][0]; + procyhi = procneigh[1][1]; + proczlo = procneigh[2][0]; + proczhi = procneigh[2][1]; + } + + nswap = maxswap = 0; + swap = NULL; + + nsend = nrecv = ncopy = 0; + send = NULL; + recv = NULL; + copy = 
NULL; + requests = NULL; +} + +/* ---------------------------------------------------------------------- + same as first constructor except o xyz lohi max are added arguments + this is for case when caller stores grid in a larger array than o xyz lohi + only affects indices() method which generates indices into the caller's array +------------------------------------------------------------------------- */ + +GridComm2::GridComm2(LAMMPS *lmp, MPI_Comm gcomm, + int gnx, int gny, int gnz, + int ixlo, int ixhi, int iylo, int iyhi, int izlo, int izhi, + int oxlo, int oxhi, int oylo, int oyhi, int ozlo, int ozhi, + int oxlo_max, int oxhi_max, int oylo_max, int oyhi_max, + int ozlo_max, int ozhi_max) + : Pointers(lmp) +{ + gridcomm = gcomm; + MPI_Comm_rank(gridcomm,&me); + MPI_Comm_size(gridcomm,&nprocs); + + nx = gnx; + ny = gny; + nz = gnz; + + inxlo = ixlo; + inxhi = ixhi; + inylo = iylo; + inyhi = iyhi; + inzlo = izlo; + inzhi = izhi; + + outxlo = oxlo; + outxhi = oxhi; + outylo = oylo; + outyhi = oyhi; + outzlo = ozlo; + outzhi = ozhi; + + outxlo_max = oxlo_max; + outxhi_max = oxhi_max; + outylo_max = oylo_max; + outyhi_max = oyhi_max; + outzlo_max = ozlo_max; + outzhi_max = ozhi_max; + + // layout == REGULAR or TILED + // for REGULAR, proc xyz lohi = my 6 neighbor procs + + layout = REGULAR; + if (comm->layout == Comm::LAYOUT_TILED) layout = TILED; + + if (layout == REGULAR) { + int (*procneigh)[2] = comm->procneigh; + + procxlo = procneigh[0][0]; + procxhi = procneigh[0][1]; + procylo = procneigh[1][0]; + procyhi = procneigh[1][1]; + proczlo = procneigh[2][0]; + proczhi = procneigh[2][1]; + } + + nswap = maxswap = 0; + swap = NULL; + + nsend = nrecv = ncopy = 0; + send = NULL; + recv = NULL; + copy = NULL; + requests = NULL; +} + +/* ---------------------------------------------------------------------- */ + +GridComm2::~GridComm2() +{ + // regular comm data struct + + for (int i = 0; i < nswap; i++) { + memory->destroy(swap[i].packlist); + memory->destroy(swap[i].unpacklist); + } + memory->sfree(swap); + + // tiled comm data structs + + for (int i = 0; i < nsend; i++) + memory->destroy(send[i].packlist); + memory->sfree(send); + + for (int i = 0; i < nrecv; i++) + memory->destroy(recv[i].unpacklist); + memory->sfree(recv); + + for (int i = 0; i < ncopy; i++) { + memory->destroy(copy[i].packlist); + memory->destroy(copy[i].unpacklist); + } + memory->sfree(copy); + + delete [] requests; +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::setup(int &nbuf1, int &nbuf2) +{ + if (layout == REGULAR) setup_regular(nbuf1,nbuf2); + else setup_tiled(nbuf1,nbuf2); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::setup_regular(int &nbuf1, int &nbuf2) +{ + int nsent,sendfirst,sendlast,recvfirst,recvlast; + int sendplanes,recvplanes; + int notdoneme,notdone; + + // notify 6 neighbor procs how many ghost grid planes I need from them + // ghost xyz lo = # of my lower grid planes that proc xyz lo needs as its ghosts + // ghost xyz hi = # of my upper grid planes that proc xyz hi needs as its ghosts + // if this proc is its own neighbor across periodic bounary, value is from self + + int nplanes = inxlo - outxlo; + if (procxlo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procxlo,0, + &ghostxhi,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostxhi = nplanes; + + nplanes = outxhi - inxhi; + if (procxhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procxhi,0, + 
&ghostxlo,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostxlo = nplanes; + + nplanes = inylo - outylo; + if (procylo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procylo,0, + &ghostyhi,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostyhi = nplanes; + + nplanes = outyhi - inyhi; + if (procyhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,procyhi,0, + &ghostylo,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostylo = nplanes; + + nplanes = inzlo - outzlo; + if (proczlo != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,proczlo,0, + &ghostzhi,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE); + else ghostzhi = nplanes; + + nplanes = outzhi - inzhi; + if (proczhi != me) + MPI_Sendrecv(&nplanes,1,MPI_INT,proczhi,0, + &ghostzlo,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); + else ghostzlo = nplanes; + + // setup swaps = exchange of grid data with one of 6 neighobr procs + // can be more than one in a direction if ghost region extends beyond neigh proc + // all procs have same swap count, but swapsize npack/nunpack can be empty + + nswap = 0; + + // send own grid pts to -x processor, recv ghost grid pts from +x processor + + nsent = 0; + sendfirst = inxlo; + sendlast = inxhi; + recvfirst = inxhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procxlo; + swap[nswap].recvproc = procxhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostxlo-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + sendfirst,sendfirst+sendplanes-1,inylo,inyhi,inzlo,inzhi); + + if (procxlo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procxlo,0, + &recvplanes,1,MPI_INT,procxhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + recvfirst,recvfirst+recvplanes-1,inylo,inyhi,inzlo,inzhi); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostxlo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +x processor, recv ghost grid pts from -x processor + + nsent = 0; + sendfirst = inxlo; + sendlast = inxhi; + recvlast = inxlo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procxhi; + swap[nswap].recvproc = procxlo; + sendplanes = MIN(sendlast-sendfirst+1,ghostxhi-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + sendlast-sendplanes+1,sendlast,inylo,inyhi,inzlo,inzhi); + + if (procxhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procxhi,0, + &recvplanes,1,MPI_INT,procxlo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + recvlast-recvplanes+1,recvlast,inylo,inyhi,inzlo,inzhi); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostxhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to -y processor, recv ghost grid pts from +y processor + + nsent = 0; + sendfirst = inylo; + sendlast = inyhi; + recvfirst = inyhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procylo; + swap[nswap].recvproc = procyhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostylo-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,sendfirst,sendfirst+sendplanes-1,inzlo,inzhi); + + if 
(procylo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procylo,0, + &recvplanes,1,MPI_INT,procyhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,recvfirst,recvfirst+recvplanes-1,inzlo,inzhi); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostylo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +y processor, recv ghost grid pts from -y processor + + nsent = 0; + sendfirst = inylo; + sendlast = inyhi; + recvlast = inylo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = procyhi; + swap[nswap].recvproc = procylo; + sendplanes = MIN(sendlast-sendfirst+1,ghostyhi-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,sendlast-sendplanes+1,sendlast,inzlo,inzhi); + + if (procyhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,procyhi,0, + &recvplanes,1,MPI_INT,procylo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,recvlast-recvplanes+1,recvlast,inzlo,inzhi); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostyhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to -z processor, recv ghost grid pts from +z processor + + nsent = 0; + sendfirst = inzlo; + sendlast = inzhi; + recvfirst = inzhi+1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = proczlo; + swap[nswap].recvproc = proczhi; + sendplanes = MIN(sendlast-sendfirst+1,ghostzlo-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,outylo,outyhi,sendfirst,sendfirst+sendplanes-1); + + if (proczlo != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,proczlo,0, + &recvplanes,1,MPI_INT,proczhi,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,outylo,outyhi,recvfirst,recvfirst+recvplanes-1); + + nsent += sendplanes; + sendfirst += sendplanes; + sendlast += recvplanes; + recvfirst += recvplanes; + nswap++; + + if (nsent < ghostzlo) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // send own grid pts to +z processor, recv ghost grid pts from -z processor + + nsent = 0; + sendfirst = inzlo; + sendlast = inzhi; + recvlast = inzlo-1; + notdone = 1; + + while (notdone) { + if (nswap == maxswap) grow_swap(); + + swap[nswap].sendproc = proczhi; + swap[nswap].recvproc = proczlo; + sendplanes = MIN(sendlast-sendfirst+1,ghostzhi-nsent); + swap[nswap].npack = + indices(swap[nswap].packlist, + outxlo,outxhi,outylo,outyhi,sendlast-sendplanes+1,sendlast); + + if (proczhi != me) + MPI_Sendrecv(&sendplanes,1,MPI_INT,proczhi,0, + &recvplanes,1,MPI_INT,proczlo,0,gridcomm,MPI_STATUS_IGNORE); + else recvplanes = sendplanes; + + swap[nswap].nunpack = + indices(swap[nswap].unpacklist, + outxlo,outxhi,outylo,outyhi,recvlast-recvplanes+1,recvlast); + + nsent += sendplanes; + sendfirst -= recvplanes; + sendlast -= sendplanes; + recvlast -= recvplanes; + nswap++; + + if (nsent < ghostzhi) notdoneme = 1; + else notdoneme = 0; + MPI_Allreduce(¬doneme,¬done,1,MPI_INT,MPI_SUM,gridcomm); + } + + // 
ngrid = max of any forward/reverse pack/unpack grid points + + int ngrid = 0; + for (int i = 0; i < nswap; i++) { + ngrid = MAX(ngrid,swap[i].npack); + ngrid = MAX(ngrid,swap[i].nunpack); + } + + nbuf1 = nbuf2 = ngrid; +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm2::setup_tiled(int &nbuf1, int &nbuf2) +{ + int i,m; + double xlo,xhi,ylo,yhi,zlo,zhi; + int ghostbox[6],pbc[3]; + + // setup RCB tree of cut info for grid + // access CommTiled to get cut dimension + // cut = this proc's inlo in that dim + // dim is -1 for proc 0, but never accessed + + rcbinfo = (RCBinfo *) + memory->smalloc(nprocs*sizeof(RCBinfo),"GridComm:rcbinfo"); + RCBinfo rcbone; + rcbone.dim = comm->rcbcutdim; + if (rcbone.dim <= 0) rcbone.cut = inxlo; + else if (rcbone.dim == 1) rcbone.cut = inylo; + else if (rcbone.dim == 2) rcbone.cut = inzlo; + MPI_Allgather(&rcbone,sizeof(RCBinfo),MPI_CHAR, + rcbinfo,sizeof(RCBinfo),MPI_CHAR,gridcomm); + + // find overlaps of my extended ghost box with all other procs + // accounts for crossings of periodic boundaries + // noverlap = # of overlaps, including self + // overlap = vector of overlap info using Overlap data struct + + ghostbox[0] = outxlo; + ghostbox[1] = outxhi; + ghostbox[2] = outylo; + ghostbox[3] = outyhi; + ghostbox[4] = outzlo; + ghostbox[5] = outzhi; + + pbc[0] = pbc[1] = pbc[2] = 0; + + memory->create(overlap_procs,nprocs,"GridComm:overlap_procs"); + noverlap = maxoverlap = 0; + overlap = NULL; + + ghost_box_drop(ghostbox,pbc); + + // send each proc an overlap message + // content: me, index of my overlap, box that overlaps with its owned cells + // ncopy = # of overlaps with myself, across a periodic boundary + + int *proclist; + memory->create(proclist,noverlap,"GridComm:proclist"); + srequest = (Request *) + memory->smalloc(noverlap*sizeof(Request),"GridComm:srequest"); + + int nsend_request = 0; + ncopy = 0; + + for (m = 0; m < noverlap; m++) { + if (overlap[m].proc == me) ncopy++; + else { + proclist[nsend_request] = overlap[m].proc; + srequest[nsend_request].sender = me; + srequest[nsend_request].index = m; + for (i = 0; i < 6; i++) + srequest[nsend_request].box[i] = overlap[m].box[i]; + nsend_request++; + } + } + + Irregular *irregular = new Irregular(lmp); + int nrecv_request = irregular->create_data(nsend_request,proclist,1); + Request *rrequest = + (Request *) memory->smalloc(nrecv_request*sizeof(Request),"GridComm:rrequest"); + irregular->exchange_data((char *) srequest,sizeof(Request),(char *) rrequest); + irregular->destroy_data(); + + // compute overlaps between received ghost boxes and my owned box + // overlap box used to setup my Send data struct and respond to requests + + send = (Send *) memory->smalloc(nrecv_request*sizeof(Send),"GridComm:send"); + sresponse = (Response *) + memory->smalloc(nrecv_request*sizeof(Response),"GridComm:sresponse"); + memory->destroy(proclist); + memory->create(proclist,nrecv_request,"GridComm:proclist"); + + for (m = 0; m < nrecv_request; m++) { + send[m].proc = rrequest[m].sender; + xlo = MAX(rrequest[m].box[0],inxlo); + xhi = MIN(rrequest[m].box[1],inxhi); + ylo = MAX(rrequest[m].box[2],inylo); + yhi = MIN(rrequest[m].box[3],inyhi); + zlo = MAX(rrequest[m].box[4],inzlo); + zhi = MIN(rrequest[m].box[5],inzhi); + send[m].npack = indices(send[m].packlist,xlo,xhi,ylo,yhi,zlo,zhi); + + proclist[m] = rrequest[m].sender; + sresponse[m].index = rrequest[m].index; + sresponse[m].box[0] = xlo; + 
sresponse[m].box[1] = xhi; + sresponse[m].box[2] = ylo; + sresponse[m].box[3] = yhi; + sresponse[m].box[4] = zlo; + sresponse[m].box[5] = zhi; + } + + nsend = nrecv_request; + + // reply to each Request message with a Response message + // content: index for the overlap on requestor, overlap box on my owned grid + + int nsend_response = nrecv_request; + int nrecv_response = irregular->create_data(nsend_response,proclist,1); + Response *rresponse = + (Response *) memory->smalloc(nrecv_response*sizeof(Response),"GridComm:rresponse"); + irregular->exchange_data((char *) sresponse,sizeof(Response),(char *) rresponse); + irregular->destroy_data(); + delete irregular; + + // process received responses + // box used to setup my Recv data struct after unwrapping via PBC + // adjacent = 0 if any box of ghost cells does not adjoin my owned cells + + recv = (Recv *) memory->smalloc(nrecv_response*sizeof(Recv),"CommGrid:recv"); + adjacent = 1; + + for (i = 0; i < nrecv_response; i++) { + m = rresponse[i].index; + recv[i].proc = overlap[m].proc; + xlo = rresponse[i].box[0] + overlap[m].pbc[0] * nx; + xhi = rresponse[i].box[1] + overlap[m].pbc[0] * nx; + ylo = rresponse[i].box[2] + overlap[m].pbc[1] * ny; + yhi = rresponse[i].box[3] + overlap[m].pbc[1] * ny; + zlo = rresponse[i].box[4] + overlap[m].pbc[2] * nz; + zhi = rresponse[i].box[5] + overlap[m].pbc[2] * nz; + recv[i].nunpack = indices(recv[i].unpacklist,xlo,xhi,ylo,yhi,zlo,zhi); + + if (xlo != inxhi+1 && xhi != inxlo-1 && + ylo != inyhi+1 && yhi != inylo-1 && + zlo != inzhi+1 && zhi != inzlo-1) adjacent = 0; + } + + nrecv = nrecv_response; + + // create Copy data struct from overlaps with self + + copy = (Copy *) memory->smalloc(ncopy*sizeof(Copy),"CommGrid:copy"); + + ncopy = 0; + for (m = 0; m < noverlap; m++) { + if (overlap[m].proc != me) continue; + xlo = overlap[m].box[0]; + xhi = overlap[m].box[1]; + ylo = overlap[m].box[2]; + yhi = overlap[m].box[3]; + zlo = overlap[m].box[4]; + zhi = overlap[m].box[5]; + copy[ncopy].npack = indices(copy[ncopy].packlist,xlo,xhi,ylo,yhi,zlo,zhi); + xlo = overlap[m].box[0] + overlap[m].pbc[0] * nx; + xhi = overlap[m].box[1] + overlap[m].pbc[0] * nx; + ylo = overlap[m].box[2] + overlap[m].pbc[1] * ny; + yhi = overlap[m].box[3] + overlap[m].pbc[1] * ny; + zlo = overlap[m].box[4] + overlap[m].pbc[2] * nz; + zhi = overlap[m].box[5] + overlap[m].pbc[2] * nz; + copy[ncopy].nunpack = indices(copy[ncopy].unpacklist,xlo,xhi,ylo,yhi,zlo,zhi); + ncopy++; + } + + // set offsets for received data + + int offset = 0; + for (m = 0; m < nsend; m++) { + send[m].offset = offset; + offset += send[m].npack; + } + + offset = 0; + for (m = 0; m < nrecv; m++) { + recv[m].offset = offset; + offset += recv[m].nunpack; + } + + // length of MPI requests vector is max of nsend, nrecv + + int nrequest = MAX(nsend,nrecv); + requests = new MPI_Request[nrequest]; + + // clean-up + + memory->sfree(rcbinfo); + memory->destroy(proclist); + memory->destroy(overlap_procs); + memory->sfree(overlap); + memory->sfree(srequest); + memory->sfree(rrequest); + memory->sfree(sresponse); + memory->sfree(rresponse); + + // nbuf1 = largest pack or unpack in any Send or Recv or Copy + // nbuf2 = larget of sum of all packs or unpacks in Send or Recv + + nbuf1 = 0; + + for (m = 0; m < ncopy; m++) { + nbuf1 = MAX(nbuf1,copy[m].npack); + nbuf1 = MAX(nbuf1,copy[m].nunpack); + } + + int nbufs = 0; + for (m = 0; m < nsend; m++) { + nbuf1 = MAX(nbuf1,send[m].npack); + nbufs += send[m].npack; + } + + int nbufr = 0; + for (m = 0; m < nrecv; m++) { + nbuf1 = 
MAX(nbuf1,recv[m].nunpack); + nbufr += recv[m].nunpack; + } + + nbuf2 = MAX(nbufs,nbufr); +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm2::ghost_box_drop(int *box, int *pbc) +{ + int i,m; + + // newbox12 and newpbc are initially copies of caller box and pbc + + int newbox1[6],newbox2[6],newpbc[3]; + + for (i = 0; i < 6; i++) newbox1[i] = newbox2[i] = box[i]; + for (i = 0; i < 3; i++) newpbc[i] = pbc[i]; + + // 6 if tests to see if box needs to be split across a periodic boundary + // final else is no split + + int splitflag = 1; + + if (box[0] < 0) { + newbox1[0] = 0; + newbox2[0] = box[0] + nx; + newbox2[1] = nx - 1; + newpbc[0]--; + } else if (box[1] >= nx) { + newbox1[1] = nx - 1; + newbox2[0] = 0; + newbox2[1] = box[1] - nx; + newpbc[0]++; + } else if (box[2] < 0) { + newbox1[2] = 0; + newbox2[2] = box[2] + ny; + newbox2[3] = ny - 1; + newpbc[1]--; + } else if (box[3] >= ny) { + newbox1[3] = ny - 1; + newbox2[2] = 0; + newbox2[3] = box[3] - ny; + newpbc[1]++; + } else if (box[4] < 0) { + newbox1[4] = 0; + newbox2[4] = box[4] + nz; + newbox2[5] = nz - 1; + newpbc[2]--; + } else if (box[5] >= nz) { + newbox1[5] = nz - 1; + newbox2[4] = 0; + newbox2[5] = box[5] - nz; + newpbc[2]++; + + // box is not split, drop on RCB tree + // returns nprocs = # of procs it overlaps, including self + // returns proc_overlap = list of proc IDs it overlaps + // skip self overlap if no crossing of periodic boundaries + + } else { + splitflag = 0; + int np = 0; + box_drop_grid(box,0,nprocs-1,np,overlap_procs); + for (m = 0; m < np; m++) { + if (noverlap == maxoverlap) grow_overlap(); + if (overlap_procs[m] == me && + pbc[0] == 0 && pbc[1] == 0 && pbc[2] == 0) continue; + overlap[noverlap].proc = overlap_procs[m]; + for (i = 0; i < 6; i++) overlap[noverlap].box[i] = box[i]; + for (i = 0; i < 3; i++) overlap[noverlap].pbc[i] = pbc[i]; + noverlap++; + } + } + + // recurse with 2 split boxes + + if (splitflag) { + ghost_box_drop(newbox1,pbc); + ghost_box_drop(newbox2,newpbc); + } +} + +/* ---------------------------------------------------------------------- +------------------------------------------------------------------------- */ + +void GridComm2::box_drop_grid(int *box, int proclower, int procupper, + int &np, int *plist) +{ + // end recursion when partition is a single proc + // add proclower to plist + + if (proclower == procupper) { + plist[np++] = proclower; + return; + } + + // drop box on each side of cut it extends beyond + // use < and >= criteria so does not include a box it only touches + // procmid = 1st processor in upper half of partition + // = location in tree that stores this cut + // cut = index of first grid cell in upper partition + // dim = 0,1,2 dimension of cut + + int procmid = proclower + (procupper - proclower) / 2 + 1; + int dim = rcbinfo[procmid].dim; + int cut = rcbinfo[procmid].cut; + + if (box[2*dim] < cut) box_drop_grid(box,proclower,procmid-1,np,plist); + if (box[2*dim+1] >= cut) box_drop_grid(box,procmid,procupper,np,plist); +} + +/* ---------------------------------------------------------------------- + check if all procs only need ghost info from adjacent procs + return 1 if yes, 0 if no +------------------------------------------------------------------------- */ + +int GridComm2::ghost_adjacent() +{ + if (layout == REGULAR) return ghost_adjacent_regular(); + return ghost_adjacent_tiled(); +} + +/* 
---------------------------------------------------------------------- + adjacent = 0 if a proc's ghost xyz lohi values exceed its subdomain size + return 0 if adjacent=0 for any proc, else 1 +------------------------------------------------------------------------- */ + +int GridComm2::ghost_adjacent_regular() +{ + adjacent = 1; + if (ghostxlo > inxhi-inxlo+1) adjacent = 0; + if (ghostxhi > inxhi-inxlo+1) adjacent = 0; + if (ghostylo > inyhi-inylo+1) adjacent = 0; + if (ghostyhi > inyhi-inylo+1) adjacent = 0; + if (ghostzlo > inzhi-inzlo+1) adjacent = 0; + if (ghostzhi > inzhi-inzlo+1) adjacent = 0; + + int adjacent_all; + MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm); + return adjacent_all; +} + +/* ---------------------------------------------------------------------- + adjacent = 0 if a proc's received ghosts were flagged + as non-adjacent in setup_tiled() + return 0 if adjacent=0 for any proc, else 1 +------------------------------------------------------------------------- */ + +int GridComm2::ghost_adjacent_tiled() +{ + int adjacent_all; + MPI_Allreduce(&adjacent,&adjacent_all,1,MPI_INT,MPI_MIN,gridcomm); + return adjacent_all; +} + +/* ---------------------------------------------------------------------- + use swap list in forward order to acquire copy of all needed ghost grid pts +------------------------------------------------------------------------- */ + +void GridComm2::forward_comm(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + if (layout == REGULAR) + forward_comm_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); + else + forward_comm_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::forward_comm_regular(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + int m; + MPI_Request request; + + for (m = 0; m < nswap; m++) { + if (swap[m].sendproc == me) + kspace->pack_forward2(which,buf2,swap[m].npack,swap[m].packlist); + else + kspace->pack_forward2(which,buf1,swap[m].npack,swap[m].packlist); + + if (swap[m].sendproc != me) { + if (swap[m].nunpack) MPI_Irecv(buf2,nper*swap[m].nunpack,datatype, + swap[m].recvproc,0,gridcomm,&request); + if (swap[m].npack) MPI_Send(buf1,nper*swap[m].npack,datatype, + swap[m].sendproc,0,gridcomm); + if (swap[m].nunpack) MPI_Wait(&request,MPI_STATUS_IGNORE); + } + + kspace->unpack_forward2(which,buf2,swap[m].nunpack,swap[m].unpacklist); + } +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::forward_comm_tiled(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *vbuf2, MPI_Datatype datatype) +{ + int i,m,offset; + + char *buf2 = (char *) vbuf2; + + // post all receives + + for (m = 0; m < nrecv; m++) { + offset = nper * recv[m].offset * nbyte; + MPI_Irecv((void *) &buf2[offset],nper*recv[m].nunpack,datatype, + recv[m].proc,0,gridcomm,&requests[m]); + } + + // perform all sends to other procs + + for (m = 0; m < nsend; m++) { + kspace->pack_forward2(which,buf1,send[m].npack,send[m].packlist); + MPI_Send(buf1,nper*send[m].npack,datatype,send[m].proc,0,gridcomm); + } + + // perform all copies to self + + for (m = 0; m < ncopy; m++) { + kspace->pack_forward2(which,buf1,copy[m].npack,copy[m].packlist); + kspace->unpack_forward2(which,buf1,copy[m].nunpack,copy[m].unpacklist); + } + + // unpack all received data + + for (i = 0; i < nrecv; i++) { + 
MPI_Waitany(nrecv,requests,&m,MPI_STATUS_IGNORE); + offset = nper * recv[m].offset * nbyte; + kspace->unpack_forward2(which,(void *) &buf2[offset], + recv[m].nunpack,recv[m].unpacklist); + } +} + +/* ---------------------------------------------------------------------- + use swap list in reverse order to compute fully summed value + for each owned grid pt that some other proc has copy of as a ghost grid pt +------------------------------------------------------------------------- */ + +void GridComm2::reverse_comm(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + if (layout == REGULAR) + reverse_comm_regular(kspace,nper,nbyte,which,buf1,buf2,datatype); + else + reverse_comm_tiled(kspace,nper,nbyte,which,buf1,buf2,datatype); +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::reverse_comm_regular(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *buf2, MPI_Datatype datatype) +{ + int m; + MPI_Request request; + + for (m = nswap-1; m >= 0; m--) { + if (swap[m].recvproc == me) + kspace->pack_reverse2(which,buf2,swap[m].nunpack,swap[m].unpacklist); + else + kspace->pack_reverse2(which,buf1,swap[m].nunpack,swap[m].unpacklist); + + if (swap[m].recvproc != me) { + if (swap[m].npack) MPI_Irecv(buf2,nper*swap[m].npack,datatype, + swap[m].sendproc,0,gridcomm,&request); + if (swap[m].nunpack) MPI_Send(buf1,nper*swap[m].nunpack,datatype, + swap[m].recvproc,0,gridcomm); + if (swap[m].npack) MPI_Wait(&request,MPI_STATUS_IGNORE); + } + + kspace->unpack_reverse2(which,buf2,swap[m].npack,swap[m].packlist); + } +} + +/* ---------------------------------------------------------------------- */ + +void GridComm2::reverse_comm_tiled(KSpace *kspace, int nper, int nbyte, int which, + void *buf1, void *vbuf2, MPI_Datatype datatype) +{ + int i,m,offset; + + char *buf2 = (char *) vbuf2; + + // post all receives + + for (m = 0; m < nsend; m++) { + offset = nper * send[m].offset * nbyte; + MPI_Irecv((void *) &buf2[offset],nper*send[m].npack,datatype, + send[m].proc,0,gridcomm,&requests[m]); + } + + // perform all sends to other procs + + for (m = 0; m < nrecv; m++) { + kspace->pack_reverse2(which,buf1,recv[m].nunpack,recv[m].unpacklist); + MPI_Send(buf1,nper*recv[m].nunpack,datatype,recv[m].proc,0,gridcomm); + } + + // perform all copies to self + + for (m = 0; m < ncopy; m++) { + kspace->pack_reverse2(which,buf1,copy[m].nunpack,copy[m].unpacklist); + kspace->unpack_reverse2(which,buf1,copy[m].npack,copy[m].packlist); + } + + // unpack all received data + + for (i = 0; i < nsend; i++) { + MPI_Waitany(nsend,requests,&m,MPI_STATUS_IGNORE); + offset = nper * send[m].offset * nbyte; + kspace->unpack_reverse2(which,(void *) &buf2[offset], + send[m].npack,send[m].packlist); + } +} + +/* ---------------------------------------------------------------------- + create swap stencil for grid own/ghost communication + swaps covers all 3 dimensions and both directions + swaps cover multiple iterations in a direction if need grid pts + from further away than nearest-neighbor proc + same swap list used by forward and reverse communication +------------------------------------------------------------------------- */ + +void GridComm2::grow_swap() +{ + maxswap += SWAPDELTA; + swap = (Swap *) + memory->srealloc(swap,maxswap*sizeof(Swap),"CommGrid:swap"); +} + +/* ---------------------------------------------------------------------- + create swap stencil for grid own/ghost communication + swaps covers all 3 
dimensions and both directions + swaps cover multiple iterations in a direction if need grid pts + from further away than nearest-neighbor proc + same swap list used by forward and reverse communication +------------------------------------------------------------------------- */ + +void GridComm2::grow_overlap() +{ + maxoverlap += SWAPDELTA; + overlap = (Overlap *) + memory->srealloc(overlap,maxoverlap*sizeof(Overlap),"CommGrid:overlap"); +} + +/* ---------------------------------------------------------------------- + create 1d list of offsets into 3d array section (xlo:xhi,ylo:yhi,zlo:zhi) + assume 3d array is allocated as (outxlo_max:outxhi_max,outylo_max:outyhi_max, + outzlo_max:outzhi_max) +------------------------------------------------------------------------- */ + +int GridComm2::indices(int *&list, + int xlo, int xhi, int ylo, int yhi, int zlo, int zhi) +{ + int nmax = (xhi-xlo+1) * (yhi-ylo+1) * (zhi-zlo+1); + memory->create(list,nmax,"CommGrid:indices"); + if (nmax == 0) return 0; + + int nx = (outxhi_max-outxlo_max+1); + int ny = (outyhi_max-outylo_max+1); + + int n = 0; + int ix,iy,iz; + for (iz = zlo; iz <= zhi; iz++) + for (iy = ylo; iy <= yhi; iy++) + for (ix = xlo; ix <= xhi; ix++) + list[n++] = (iz-outzlo_max)*ny*nx + (iy-outylo_max)*nx + (ix-outxlo_max); + + return nmax; +} diff --git a/src/KSPACE/gridcomm2.h b/src/KSPACE/gridcomm2.h new file mode 100644 index 0000000000..eeba990d2d --- /dev/null +++ b/src/KSPACE/gridcomm2.h @@ -0,0 +1,201 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifndef LMP_GRIDCOMM2_H +#define LMP_GRIDCOMM2_H + +#include "pointers.h" + +namespace LAMMPS_NS { + +class GridComm2 : protected Pointers { + public: + GridComm2(class LAMMPS *, MPI_Comm, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); + GridComm2(class LAMMPS *, MPI_Comm, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int, + int, int, int, int, int, int); + ~GridComm2(); + void setup(int &, int &); + int ghost_adjacent(); + void forward_comm(class KSpace *, int, int, int, void *, void *, MPI_Datatype); + void reverse_comm(class KSpace *, int, int, int, void *, void *, MPI_Datatype); + + private: + int me,nprocs; + int layout; // REGULAR or TILED + MPI_Comm gridcomm; + + // inputs from caller via constructor + + int nx,ny,nz; // size of global grid in all 3 dims + int inxlo,inxhi; // inclusive extent of my grid chunk + int inylo,inyhi; // 0 <= in <= N-1 + int inzlo,inzhi; + int outxlo,outxhi; // inclusive extent of my grid chunk plus + int outylo,outyhi; // ghost cells in all 6 directions + int outzlo,outzhi; // lo indices can be < 0, hi indices can be >= N + int outxlo_max,outxhi_max; // ?? 
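+                              // these _max bounds describe the caller's full
+                              // allocated grid array, which may be larger than
+                              // out xyz lohi (see 2nd constructor); they are
+                              // only used by indices() to compute offsets into
+                              // the caller's array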
+ int outylo_max,outyhi_max; + int outzlo_max,outzhi_max; + + // ------------------------------------------- + // internal variables for REGULAR layout + // ------------------------------------------- + + int procxlo,procxhi; // 6 neighbor procs that adjoin me + int procylo,procyhi; // not used for comm_style = tiled + int proczlo,proczhi; + + int ghostxlo,ghostxhi; // # of my owned grid planes needed + int ghostylo,ghostyhi; // by neighobr procs in each dir as their ghost planes + int ghostzlo,ghostzhi; + + // swap = exchange of owned and ghost grid cells between 2 procs, including self + + struct Swap { + int sendproc; // proc to send to for forward comm + int recvproc; // proc to recv from for forward comm + int npack; // # of datums to pack + int nunpack; // # of datums to unpack + int *packlist; // 3d array offsets to pack + int *unpacklist; // 3d array offsets to unpack + }; + + int nswap,maxswap; + Swap *swap; + + // ------------------------------------------- + // internal variables for TILED layout + // ------------------------------------------- + + int *overlap_procs; + MPI_Request *requests; + + // RCB tree of cut info + // each proc contributes one value, except proc 0 + + struct RCBinfo { + int dim; // 0,1,2 = which dim the cut is in + int cut; // grid index of lowest cell in upper half of cut + }; + + RCBinfo *rcbinfo; + + // overlap = a proc whose owned cells overlap with my extended ghost box + // includes overlaps across periodic boundaries, can also be self + + struct Overlap { + int proc; // proc whose owned cells overlap my ghost cells + int box[6]; // box that overlaps otherproc's owned cells + // this box is wholly contained within global grid + int pbc[3]; // PBC offsets to convert box to a portion of my ghost box + // my ghost box may extend beyond global grid + }; + + int noverlap,maxoverlap; + Overlap *overlap; + + // request = sent to each proc whose owned cells overlap my ghost cells + + struct Request { + int sender; // sending proc + int index; // index of overlap on sender + int box[6]; // box that overlaps receiver's owned cells + // wholly contained within global grid + }; + + Request *srequest,*rrequest; + + // response = reply from each proc whose owned cells overlap my ghost cells + + struct Response { + int index; // index of my overlap for the initial request + int box[6]; // box that overlaps responder's owned cells + // wholly contained within global grid + // has to unwrapped by PBC to map to my ghost cells + }; + + Response *sresponse,*rresponse; + + // send = proc to send a subset of my owned cells to, for forward comm + // for reverse comm, proc I receive ghost overlaps with my owned cells from + // offset used in reverse comm to recv a message in middle of a large buffer + + struct Send { + int proc; + int npack; + int *packlist; + int offset; + }; + + // recv = proc to recv a subset of my ghost cells from, for forward comm + // for reverse comm, proc I send a subset of my ghost cells to + // offset used in forward comm to recv a message in middle of a large buffer + + struct Recv { + int proc; + int nunpack; + int *unpacklist; + int offset; + }; + + int adjacent; // 0 on a proc who receives ghosts from a non-neighbor proc + + // copy = subset of my owned cells to copy into subset of my ghost cells + // that describes forward comm, for reverse comm it is the opposite + + struct Copy { + int npack; + int nunpack; + int *packlist; + int *unpacklist; + }; + + int nsend,nrecv,ncopy; + Send *send; + Recv *recv; + Copy *copy; + + // 
------------------------------------------- + // internal methods + // ------------------------------------------- + + void setup_regular(int &, int &); + void setup_tiled(int &, int &); + void ghost_box_drop(int *, int *); + void box_drop_grid(int *, int, int, int &, int *); + + int ghost_adjacent_regular(); + int ghost_adjacent_tiled(); + + void forward_comm_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void forward_comm_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_regular(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + void reverse_comm_tiled(class KSpace *, int, int, int, + void *, void *, MPI_Datatype); + + void grow_swap(); + void grow_overlap(); + + int indices(int *&, int, int, int, int, int, int); +}; + +} + +#endif diff --git a/src/KSPACE/pppm2.cpp b/src/KSPACE/pppm2.cpp new file mode 100644 index 0000000000..19baa13f16 --- /dev/null +++ b/src/KSPACE/pppm2.cpp @@ -0,0 +1,3524 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL) + per-atom energy/virial & group/group energy/force added by Stan Moore (BYU) + analytic diff (2 FFT) option added by Rolf Isele-Holder (Aachen University) + triclinic added by Stan Moore (SNL) +------------------------------------------------------------------------- */ + +#include "pppm2.h" +#include +#include +#include +#include +#include "atom.h" +#include "comm.h" +#include "gridcomm2.h" +#include "neighbor.h" +#include "force.h" +#include "pair.h" +#include "bond.h" +#include "angle.h" +#include "domain.h" +#include "fft3d_wrap.h" +#include "remap_wrap.h" +#include "memory.h" +#include "error.h" +#include "utils.h" +#include "fmt/format.h" + +#include "math_const.h" +#include "math_special.h" + +using namespace LAMMPS_NS; +using namespace MathConst; +using namespace MathSpecial; + +#define MAXORDER 7 +#define OFFSET 16384 +#define LARGE 10000.0 +#define SMALL 0.00001 +#define EPS_HOC 1.0e-7 + +enum{REVERSE_RHO}; +enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; + +#ifdef FFT_SINGLE +#define ZEROF 0.0f +#define ONEF 1.0f +#else +#define ZEROF 0.0 +#define ONEF 1.0 +#endif + +/* ---------------------------------------------------------------------- */ + +PPPM2::PPPM2(LAMMPS *lmp) : KSpace(lmp), + factors(NULL), density_brick(NULL), vdx_brick(NULL), vdy_brick(NULL), vdz_brick(NULL), + u_brick(NULL), v0_brick(NULL), v1_brick(NULL), v2_brick(NULL), v3_brick(NULL), + v4_brick(NULL), v5_brick(NULL), greensfn(NULL), vg(NULL), fkx(NULL), fky(NULL), + fkz(NULL), density_fft(NULL), work1(NULL), work2(NULL), gf_b(NULL), rho1d(NULL), + rho_coeff(NULL), drho1d(NULL), drho_coeff(NULL), + sf_precoeff1(NULL), sf_precoeff2(NULL), sf_precoeff3(NULL), + sf_precoeff4(NULL), sf_precoeff5(NULL), sf_precoeff6(NULL), + acons(NULL), density_A_brick(NULL), 
density_B_brick(NULL), density_A_fft(NULL), + density_B_fft(NULL), fft1(NULL), fft2(NULL), remap(NULL), gc(NULL), + gc_buf1(NULL), gc_buf2(NULL), part2grid(NULL), boxlo(NULL) +{ + peratom_allocate_flag = 0; + group_allocate_flag = 0; + + pppmflag = 1; + group_group_enable = 1; + triclinic = domain->triclinic; + + nfactors = 3; + factors = new int[nfactors]; + factors[0] = 2; + factors[1] = 3; + factors[2] = 5; + + MPI_Comm_rank(world,&me); + MPI_Comm_size(world,&nprocs); + + nfft_both = 0; + nxhi_in = nxlo_in = nxhi_out = nxlo_out = 0; + nyhi_in = nylo_in = nyhi_out = nylo_out = 0; + nzhi_in = nzlo_in = nzhi_out = nzlo_out = 0; + + density_brick = vdx_brick = vdy_brick = vdz_brick = NULL; + density_fft = NULL; + u_brick = NULL; + v0_brick = v1_brick = v2_brick = v3_brick = v4_brick = v5_brick = NULL; + greensfn = NULL; + work1 = work2 = NULL; + vg = NULL; + fkx = fky = fkz = NULL; + + sf_precoeff1 = sf_precoeff2 = sf_precoeff3 = + sf_precoeff4 = sf_precoeff5 = sf_precoeff6 = NULL; + + density_A_brick = density_B_brick = NULL; + density_A_fft = density_B_fft = NULL; + + gf_b = NULL; + rho1d = rho_coeff = drho1d = drho_coeff = NULL; + + fft1 = fft2 = NULL; + remap = NULL; + gc = NULL; + gc_buf1 = gc_buf2 = NULL; + + nmax = 0; + part2grid = NULL; + + // define acons coefficients for estimation of kspace errors + // see JCP 109, pg 7698 for derivation of coefficients + // higher order coefficients may be computed if needed + + memory->create(acons,8,7,"pppm:acons"); + acons[1][0] = 2.0 / 3.0; + acons[2][0] = 1.0 / 50.0; + acons[2][1] = 5.0 / 294.0; + acons[3][0] = 1.0 / 588.0; + acons[3][1] = 7.0 / 1440.0; + acons[3][2] = 21.0 / 3872.0; + acons[4][0] = 1.0 / 4320.0; + acons[4][1] = 3.0 / 1936.0; + acons[4][2] = 7601.0 / 2271360.0; + acons[4][3] = 143.0 / 28800.0; + acons[5][0] = 1.0 / 23232.0; + acons[5][1] = 7601.0 / 13628160.0; + acons[5][2] = 143.0 / 69120.0; + acons[5][3] = 517231.0 / 106536960.0; + acons[5][4] = 106640677.0 / 11737571328.0; + acons[6][0] = 691.0 / 68140800.0; + acons[6][1] = 13.0 / 57600.0; + acons[6][2] = 47021.0 / 35512320.0; + acons[6][3] = 9694607.0 / 2095994880.0; + acons[6][4] = 733191589.0 / 59609088000.0; + acons[6][5] = 326190917.0 / 11700633600.0; + acons[7][0] = 1.0 / 345600.0; + acons[7][1] = 3617.0 / 35512320.0; + acons[7][2] = 745739.0 / 838397952.0; + acons[7][3] = 56399353.0 / 12773376000.0; + acons[7][4] = 25091609.0 / 1560084480.0; + acons[7][5] = 1755948832039.0 / 36229939200000.0; + acons[7][6] = 4887769399.0 / 37838389248.0; +} + +/* ---------------------------------------------------------------------- */ + +void PPPM2::settings(int narg, char **arg) +{ + if (narg < 1) error->all(FLERR,"Illegal kspace_style pppm command"); + accuracy_relative = fabs(force->numeric(FLERR,arg[0])); +} + +/* ---------------------------------------------------------------------- + free all memory +------------------------------------------------------------------------- */ + +PPPM2::~PPPM2() +{ + if (copymode) return; + + delete [] factors; + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + if (group_allocate_flag) deallocate_groups(); + memory->destroy(part2grid); + memory->destroy(acons); +} + +/* ---------------------------------------------------------------------- + called once before run +------------------------------------------------------------------------- */ + +void PPPM2::init() +{ + if (me == 0) utils::logmesg(lmp,"PPPM initialization ...\n"); + + // error check + + triclinic_check(); + + if (triclinic != domain->triclinic) + 
error->all(FLERR,"Must redefine kspace_style after changing to triclinic box"); + + if (domain->triclinic && differentiation_flag == 1) + error->all(FLERR,"Cannot (yet) use PPPM with triclinic box " + "and kspace_modify diff ad"); + if (domain->triclinic && slabflag) + error->all(FLERR,"Cannot (yet) use PPPM with triclinic box and " + "slab correction"); + if (domain->dimension == 2) + error->all(FLERR,"Cannot use PPPM with 2d simulation"); + + if (!atom->q_flag) + error->all(FLERR,"Kspace style requires atom attribute q"); + + if (slabflag == 0 && domain->nonperiodic > 0) + error->all(FLERR,"Cannot use non-periodic boundaries with PPPM"); + if (slabflag) { + if (domain->xperiodic != 1 || domain->yperiodic != 1 || + domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) + error->all(FLERR,"Incorrect boundaries with slab PPPM"); + } + + if (order < 2 || order > MAXORDER) + error->all(FLERR,fmt::format("PPPM order cannot be < 2 or > {}",MAXORDER)); + + // compute two charge force + + two_charge(); + + // extract short-range Coulombic cutoff from pair style + + triclinic = domain->triclinic; + pair_check(); + + int itmp = 0; + double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp); + if (p_cutoff == NULL) + error->all(FLERR,"KSpace style is incompatible with Pair style"); + cutoff = *p_cutoff; + + // if kspace is TIP4P, extract TIP4P params from pair style + // bond/angle are not yet init(), so insure equilibrium request is valid + + qdist = 0.0; + + if (tip4pflag) { + if (me == 0) utils::logmesg(lmp," extracting TIP4P info from pair style\n"); + + double *p_qdist = (double *) force->pair->extract("qdist",itmp); + int *p_typeO = (int *) force->pair->extract("typeO",itmp); + int *p_typeH = (int *) force->pair->extract("typeH",itmp); + int *p_typeA = (int *) force->pair->extract("typeA",itmp); + int *p_typeB = (int *) force->pair->extract("typeB",itmp); + if (!p_qdist || !p_typeO || !p_typeH || !p_typeA || !p_typeB) + error->all(FLERR,"Pair style is incompatible with TIP4P KSpace style"); + qdist = *p_qdist; + typeO = *p_typeO; + typeH = *p_typeH; + int typeA = *p_typeA; + int typeB = *p_typeB; + + if (force->angle == NULL || force->bond == NULL || + force->angle->setflag == NULL || force->bond->setflag == NULL) + error->all(FLERR,"Bond and angle potentials must be defined for TIP4P"); + if (typeA < 1 || typeA > atom->nangletypes || + force->angle->setflag[typeA] == 0) + error->all(FLERR,"Bad TIP4P angle type for PPPM/TIP4P"); + if (typeB < 1 || typeB > atom->nbondtypes || + force->bond->setflag[typeB] == 0) + error->all(FLERR,"Bad TIP4P bond type for PPPM/TIP4P"); + double theta = force->angle->equilibrium_angle(typeA); + double blen = force->bond->equilibrium_distance(typeB); + alpha = qdist / (cos(0.5*theta) * blen); + } + + // compute qsum & qsqsum and warn if not charge-neutral + + scale = 1.0; + qqrd2e = force->qqrd2e; + qsum_qsq(); + natoms_original = atom->natoms; + + // set accuracy (force units) from accuracy_relative or accuracy_absolute + + if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute; + else accuracy = accuracy_relative * two_charge_force; + + // free all arrays previously allocated + + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + if (group_allocate_flag) deallocate_groups(); + + // setup FFT grid resolution and g_ewald + // normally one iteration thru while loop is all that is required + // if grid stencil does not extend beyond neighbor proc + // or overlap is allowed, then done + // else reduce order and try again + + int 
(*procneigh)[2] = comm->procneigh; + + GridComm2 *gctmp = NULL; + int iteration = 0; + + while (order >= minorder) { + if (iteration && me == 0) + error->warning(FLERR,"Reducing PPPM order b/c stencil extends " + "beyond nearest neighbor processor"); + + if (stagger_flag && !differentiation_flag) compute_gf_denom(); + set_grid_global(); + set_grid_local(); + if (overlap_allowed) break; + + gctmp = new GridComm2(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + int tmp1,tmp2; + gctmp->setup(tmp1,tmp2); + if (gctmp->ghost_adjacent()) break; + delete gctmp; + + order--; + iteration++; + } + + if (order < minorder) error->all(FLERR,"PPPM order < minimum allowed order"); + if (!overlap_allowed && !gctmp->ghost_adjacent()) + error->all(FLERR,"PPPM grid stencil extends " + "beyond nearest neighbor processor"); + if (gctmp) delete gctmp; + + // adjust g_ewald + + if (!gewaldflag) adjust_gewald(); + + // calculate the final accuracy + + double estimated_accuracy = final_accuracy(); + + // print stats + + int ngrid_max,nfft_both_max; + MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world); + MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world); + + if (me == 0) { + std::string mesg = fmt::format(" G vector (1/distance) = {:.8g}\n",g_ewald); + mesg += fmt::format(" grid = {} {} {}\n",nx_pppm,ny_pppm,nz_pppm); + mesg += fmt::format(" stencil order = {}\n",order); + mesg += fmt::format(" estimated absolute RMS force accuracy = {:.8g}\n", + estimated_accuracy); + mesg += fmt::format(" estimated relative force accuracy = {:.8g}\n", + estimated_accuracy/two_charge_force); + mesg += " using " LMP_FFT_PREC " precision " LMP_FFT_LIB "\n"; + mesg += fmt::format(" 3d grid and FFT values/proc = {} {}\n", + ngrid_max,nfft_both_max); + utils::logmesg(lmp,mesg); + } + + // allocate K-space dependent memory + // don't invoke allocate peratom() or group(), will be allocated when needed + + allocate(); + + // pre-compute Green's function denomiator expansion + // pre-compute 1d charge distribution coefficients + + compute_gf_denom(); + if (differentiation_flag == 1) compute_sf_precoeff(); + compute_rho_coeff(); +} + +/* ---------------------------------------------------------------------- + adjust PPPM coeffs, called initially and whenever volume has changed +------------------------------------------------------------------------- */ + +void PPPM2::setup() +{ + if (triclinic) { + setup_triclinic(); + return; + } + + // perform some checks to avoid illegal boundaries with read_data + + if (slabflag == 0 && domain->nonperiodic > 0) + error->all(FLERR,"Cannot use non-periodic boundaries with PPPM"); + if (slabflag) { + if (domain->xperiodic != 1 || domain->yperiodic != 1 || + domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1) + error->all(FLERR,"Incorrect boundaries with slab PPPM"); + } + + int i,j,k,n; + double *prd; + + // volume-dependent factors + // adjust z dimension for 2d slab PPPM + // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + delxinv = nx_pppm/xprd; + delyinv = ny_pppm/yprd; + delzinv = nz_pppm/zprd_slab; + + delvolinv = delxinv*delyinv*delzinv; + + double unitkx = (MY_2PI/xprd); + double unitky = (MY_2PI/yprd); + double unitkz = 
(MY_2PI/zprd_slab); + + // fkx,fky,fkz for my FFT grid pts + + double per; + + for (i = nxlo_fft; i <= nxhi_fft; i++) { + per = i - nx_pppm*(2*i/nx_pppm); + fkx[i] = unitkx*per; + } + + for (i = nylo_fft; i <= nyhi_fft; i++) { + per = i - ny_pppm*(2*i/ny_pppm); + fky[i] = unitky*per; + } + + for (i = nzlo_fft; i <= nzhi_fft; i++) { + per = i - nz_pppm*(2*i/nz_pppm); + fkz[i] = unitkz*per; + } + + // virial coefficients + + double sqk,vterm; + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) { + for (j = nylo_fft; j <= nyhi_fft; j++) { + for (i = nxlo_fft; i <= nxhi_fft; i++) { + sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; + if (sqk == 0.0) { + vg[n][0] = 0.0; + vg[n][1] = 0.0; + vg[n][2] = 0.0; + vg[n][3] = 0.0; + vg[n][4] = 0.0; + vg[n][5] = 0.0; + } else { + vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); + vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; + vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; + vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; + vg[n][3] = vterm*fkx[i]*fky[j]; + vg[n][4] = vterm*fkx[i]*fkz[k]; + vg[n][5] = vterm*fky[j]*fkz[k]; + } + n++; + } + } + } + + if (differentiation_flag == 1) compute_gf_ad(); + else compute_gf_ik(); +} + +/* ---------------------------------------------------------------------- + adjust PPPM coeffs, called initially and whenever volume has changed + for a triclinic system +------------------------------------------------------------------------- */ + +void PPPM2::setup_triclinic() +{ + int i,j,k,n; + double *prd; + + // volume-dependent factors + // adjust z dimension for 2d slab PPPM + // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 + + prd = domain->prd; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + // use lamda (0-1) coordinates + + delxinv = nx_pppm; + delyinv = ny_pppm; + delzinv = nz_pppm; + delvolinv = delxinv*delyinv*delzinv/volume; + + // fkx,fky,fkz for my FFT grid pts + + double per_i,per_j,per_k; + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) { + per_k = k - nz_pppm*(2*k/nz_pppm); + for (j = nylo_fft; j <= nyhi_fft; j++) { + per_j = j - ny_pppm*(2*j/ny_pppm); + for (i = nxlo_fft; i <= nxhi_fft; i++) { + per_i = i - nx_pppm*(2*i/nx_pppm); + + double unitk_lamda[3]; + unitk_lamda[0] = 2.0*MY_PI*per_i; + unitk_lamda[1] = 2.0*MY_PI*per_j; + unitk_lamda[2] = 2.0*MY_PI*per_k; + x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]); + fkx[n] = unitk_lamda[0]; + fky[n] = unitk_lamda[1]; + fkz[n] = unitk_lamda[2]; + n++; + } + } + } + + // virial coefficients + + double sqk,vterm; + + for (n = 0; n < nfft; n++) { + sqk = fkx[n]*fkx[n] + fky[n]*fky[n] + fkz[n]*fkz[n]; + if (sqk == 0.0) { + vg[n][0] = 0.0; + vg[n][1] = 0.0; + vg[n][2] = 0.0; + vg[n][3] = 0.0; + vg[n][4] = 0.0; + vg[n][5] = 0.0; + } else { + vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); + vg[n][0] = 1.0 + vterm*fkx[n]*fkx[n]; + vg[n][1] = 1.0 + vterm*fky[n]*fky[n]; + vg[n][2] = 1.0 + vterm*fkz[n]*fkz[n]; + vg[n][3] = vterm*fkx[n]*fky[n]; + vg[n][4] = vterm*fkx[n]*fkz[n]; + vg[n][5] = vterm*fky[n]*fkz[n]; + } + } + + compute_gf_ik_triclinic(); +} + +/* ---------------------------------------------------------------------- + reset local grid arrays and communication stencils + called by fix balance b/c it changed sizes of processor sub-domains +------------------------------------------------------------------------- */ + +void PPPM2::setup_grid() +{ + // free all arrays previously allocated + + deallocate(); + if (peratom_allocate_flag) deallocate_peratom(); + if 
(group_allocate_flag) deallocate_groups(); + + // reset portion of global grid that each proc owns + + set_grid_local(); + + // reallocate K-space dependent memory + // check if grid communication is now overlapping if not allowed + // don't invoke allocate peratom() or group(), will be allocated when needed + + allocate(); + + if (!overlap_allowed && !gc->ghost_adjacent()) + error->all(FLERR,"PPPM grid stencil extends " + "beyond nearest neighbor processor"); + + // pre-compute Green's function denomiator expansion + // pre-compute 1d charge distribution coefficients + + compute_gf_denom(); + if (differentiation_flag == 1) compute_sf_precoeff(); + compute_rho_coeff(); + + // pre-compute volume-dependent coeffs for portion of grid I now own + + setup(); +} + +/* ---------------------------------------------------------------------- + compute the PPPM long-range force, energy, virial +------------------------------------------------------------------------- */ + +void PPPM2::compute(int eflag, int vflag) +{ + int i,j; + + // set energy/virial flags + // invoke allocate_peratom() if needed for first time + + ev_init(eflag,vflag); + + if (evflag_atom && !peratom_allocate_flag) allocate_peratom(); + + // if atom count has changed, update qsum and qsqsum + + if (atom->natoms != natoms_original) { + qsum_qsq(); + natoms_original = atom->natoms; + } + + // return if there are no charges + + if (qsqsum == 0.0) return; + + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + // extend size of per-atom arrays if necessary + + if (atom->nmax > nmax) { + memory->destroy(part2grid); + nmax = atom->nmax; + memory->create(part2grid,nmax,3,"pppm:part2grid"); + } + + // find grid points for all my particles + // map my particle charge onto my local 3d density grid + + particle_map(); + make_rho(); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + brick2fft(); + + // compute potential gradient on my FFT grid and + // portion of e_long on this proc's FFT grid + // return gradients (electric fields) in 3d brick decomposition + // also performs per-atom calculations via poisson_peratom() + + poisson(); + + // all procs communicate E-field values + // to fill ghost cells surrounding their 3d bricks + + if (differentiation_flag == 1) + gc->forward_comm(this,1,sizeof(FFT_SCALAR),FORWARD_AD, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else + gc->forward_comm(this,3,sizeof(FFT_SCALAR),FORWARD_IK, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + + // extra per-atom energy/virial communication + + if (evflag_atom) { + if (differentiation_flag == 1 && vflag_atom) + gc->forward_comm(this,6,sizeof(FFT_SCALAR),FORWARD_AD_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + else if (differentiation_flag == 0) + gc->forward_comm(this,7,sizeof(FFT_SCALAR),FORWARD_IK_PERATOM, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + } + + // calculate the force on my particles + + fieldforce(); + + // extra per-atom energy/virial communication + + if (evflag_atom) fieldforce_peratom(); + + // sum global energy across procs and add in volume-dependent term + + const double qscale = qqrd2e * scale; + + if (eflag_global) { + double energy_all; + MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); + energy = energy_all; + + energy *= 
0.5*volume; + energy -= g_ewald*qsqsum/MY_PIS + + MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume); + energy *= qscale; + } + + // sum global virial across procs + + if (vflag_global) { + double virial_all[6]; + MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); + for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; + } + + // per-atom energy/virial + // energy includes self-energy correction + // ntotal accounts for TIP4P tallying eatom/vatom for ghost atoms + + if (evflag_atom) { + double *q = atom->q; + int nlocal = atom->nlocal; + int ntotal = nlocal; + if (tip4pflag) ntotal += atom->nghost; + + if (eflag_atom) { + for (i = 0; i < nlocal; i++) { + eatom[i] *= 0.5; + eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / + (g_ewald*g_ewald*volume); + eatom[i] *= qscale; + } + for (i = nlocal; i < ntotal; i++) eatom[i] *= 0.5*qscale; + } + + if (vflag_atom) { + for (i = 0; i < ntotal; i++) + for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale; + } + } + + // 2d slab correction + + if (slabflag == 1) slabcorr(); + + // convert atoms back from lamda to box coords + + if (triclinic) domain->lamda2x(atom->nlocal); +} + +/* ---------------------------------------------------------------------- + allocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::allocate() +{ + memory->create3d_offset(density_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_brick"); + + memory->create(density_fft,nfft_both,"pppm:density_fft"); + memory->create(greensfn,nfft_both,"pppm:greensfn"); + memory->create(work1,2*nfft_both,"pppm:work1"); + memory->create(work2,2*nfft_both,"pppm:work2"); + memory->create(vg,nfft_both,6,"pppm:vg"); + + if (triclinic == 0) { + memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx"); + memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky"); + memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz"); + } else { + memory->create(fkx,nfft_both,"pppm:fkx"); + memory->create(fky,nfft_both,"pppm:fky"); + memory->create(fkz,nfft_both,"pppm:fkz"); + } + + if (differentiation_flag == 1) { + memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:u_brick"); + + memory->create(sf_precoeff1,nfft_both,"pppm:sf_precoeff1"); + memory->create(sf_precoeff2,nfft_both,"pppm:sf_precoeff2"); + memory->create(sf_precoeff3,nfft_both,"pppm:sf_precoeff3"); + memory->create(sf_precoeff4,nfft_both,"pppm:sf_precoeff4"); + memory->create(sf_precoeff5,nfft_both,"pppm:sf_precoeff5"); + memory->create(sf_precoeff6,nfft_both,"pppm:sf_precoeff6"); + + } else { + memory->create3d_offset(vdx_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdx_brick"); + memory->create3d_offset(vdy_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdy_brick"); + memory->create3d_offset(vdz_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:vdz_brick"); + } + + // summation coeffs + + order_allocated = order; + if (!stagger_flag) memory->create(gf_b,order,"pppm:gf_b"); + memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d"); + memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm:drho1d"); + memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff"); + memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2, + "pppm:drho_coeff"); + + // create 2 FFTs and a Remap + // 1st FFT keeps data in FFT decomposition + // 2nd FFT returns data in 3d brick decomposition + 
// remap takes data from 3d brick to FFT decomposition + + int tmp; + + fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 0,0,&tmp,collective_flag); + + fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + 0,0,&tmp,collective_flag); + + remap = new Remap(lmp,world, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft, + 1,0,0,FFT_PRECISION,collective_flag); + + // create ghost grid object for rho and electric field communication + // also create 2 bufs for ghost grid cell comm, passed to GridComm methods + + gc = new GridComm2(lmp,world,nx_pppm,ny_pppm,nz_pppm, + nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in, + nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out); + + gc->setup(ngc_buf1,ngc_buf2); + + if (differentiation_flag) npergrid = 1; + else npergrid = 3; + + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); +} + +/* ---------------------------------------------------------------------- + deallocate memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::deallocate() +{ + memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out); + + if (differentiation_flag == 1) { + memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy(sf_precoeff1); + memory->destroy(sf_precoeff2); + memory->destroy(sf_precoeff3); + memory->destroy(sf_precoeff4); + memory->destroy(sf_precoeff5); + memory->destroy(sf_precoeff6); + } else { + memory->destroy3d_offset(vdx_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdy_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(vdz_brick,nzlo_out,nylo_out,nxlo_out); + } + + memory->destroy(density_fft); + memory->destroy(greensfn); + memory->destroy(work1); + memory->destroy(work2); + memory->destroy(vg); + + if (triclinic == 0) { + memory->destroy1d_offset(fkx,nxlo_fft); + memory->destroy1d_offset(fky,nylo_fft); + memory->destroy1d_offset(fkz,nzlo_fft); + } else { + memory->destroy(fkx); + memory->destroy(fky); + memory->destroy(fkz); + } + + memory->destroy(gf_b); + if (stagger_flag) gf_b = NULL; + memory->destroy2d_offset(rho1d,-order_allocated/2); + memory->destroy2d_offset(drho1d,-order_allocated/2); + memory->destroy2d_offset(rho_coeff,(1-order_allocated)/2); + memory->destroy2d_offset(drho_coeff,(1-order_allocated)/2); + + delete fft1; + delete fft2; + delete remap; + delete gc; + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); +} + +/* ---------------------------------------------------------------------- + allocate per-atom memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::allocate_peratom() +{ + peratom_allocate_flag = 1; + + if (differentiation_flag != 1) + memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:u_brick"); + + memory->create3d_offset(v0_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v0_brick"); + + memory->create3d_offset(v1_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v1_brick"); + memory->create3d_offset(v2_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + 
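/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): the ghost-cell
   buffers created just above are sized as npergrid*ngc_buf, where
   npergrid is the number of FFT_SCALAR values per grid point; the
   per-atom path in allocate_peratom() below enlarges them to 6 or 7.
   Helper names are hypothetical
------------------------------------------------------------------------- */

#include <cstddef>

// values communicated per ghost grid cell in the forward pass:
//   ik differentiation : 3 (Ex,Ey,Ez bricks)      ad : 1 (u brick)
//   per-atom ik        : 7 (u + 6 virial bricks)  per-atom ad : 6
int values_per_gridpoint(bool ad_differentiation, bool peratom)
{
  if (!peratom) return ad_differentiation ? 1 : 3;
  return ad_differentiation ? 6 : 7;
}

std::size_t ghost_buffer_size(int npergrid, int ngc_buf)
{
  return static_cast<std::size_t>(npergrid) * ngc_buf;   // in FFT_SCALARs
}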
nxlo_out,nxhi_out,"pppm:v2_brick"); + memory->create3d_offset(v3_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v3_brick"); + memory->create3d_offset(v4_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v4_brick"); + memory->create3d_offset(v5_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:v5_brick"); + + // use same GC ghost grid object for peratom grid communication + // but need to reallocate a larger gc_buf1 and gc_buf2 + + if (differentiation_flag) npergrid = 6; + else npergrid = 7; + + memory->destroy(gc_buf1); + memory->destroy(gc_buf2); + memory->create(gc_buf1,npergrid*ngc_buf1,"pppm:gc_buf1"); + memory->create(gc_buf2,npergrid*ngc_buf2,"pppm:gc_buf2"); +} + +/* ---------------------------------------------------------------------- + deallocate per-atom memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPM2::deallocate_peratom() +{ + peratom_allocate_flag = 0; + + memory->destroy3d_offset(v0_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v1_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v2_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v3_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v4_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(v5_brick,nzlo_out,nylo_out,nxlo_out); + + if (differentiation_flag != 1) + memory->destroy3d_offset(u_brick,nzlo_out,nylo_out,nxlo_out); +} + +/* ---------------------------------------------------------------------- + set global size of PPPM grid = nx,ny,nz_pppm + used for charge accumulation, FFTs, and electric field interpolation +------------------------------------------------------------------------- */ + +void PPPM2::set_grid_global() +{ + // use xprd,yprd,zprd (even if triclinic, and then scale later) + // adjust z dimension for 2d slab PPPM + // 3d PPPM just uses zprd since slab_volfactor = 1.0 + + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double zprd_slab = zprd*slab_volfactor; + + // make initial g_ewald estimate + // based on desired accuracy and real space cutoff + // fluid-occupied volume used to estimate real-space error + // zprd used rather than zprd_slab + + double h; + bigint natoms = atom->natoms; + + if (!gewaldflag) { + if (accuracy <= 0.0) + error->all(FLERR,"KSpace accuracy must be > 0"); + if (q2 == 0.0) + error->all(FLERR,"Must use kspace_modify gewald for uncharged system"); + g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*q2); + if (g_ewald >= 1.0) g_ewald = (1.35 - 0.15*log(accuracy))/cutoff; + else g_ewald = sqrt(-log(g_ewald)) / cutoff; + } + + // set optimal nx_pppm,ny_pppm,nz_pppm based on order and accuracy + // nz_pppm uses extended zprd_slab instead of zprd + // reduce it until accuracy target is met + + if (!gridflag) { + + if (differentiation_flag == 1 || stagger_flag) { + + h = h_x = h_y = h_z = 4.0/g_ewald; + int count = 0; + while (1) { + + // set grid dimensions + + nx_pppm = static_cast (xprd/h_x); + ny_pppm = static_cast (yprd/h_y); + nz_pppm = static_cast (zprd_slab/h_z); + + if (nx_pppm <= 1) nx_pppm = 2; + if (ny_pppm <= 1) ny_pppm = 2; + if (nz_pppm <= 1) nz_pppm = 2; + + // estimate Kspace force error + + double df_kspace = compute_df_kspace(); + + // break loop if the accuracy has been reached or + // too many loops have been performed + + count++; + if (df_kspace <= accuracy) break; + + if (count > 500) error->all(FLERR, 
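/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): the while(1)
   loop in set_grid_global() surrounding this point shrinks the grid
   spacing by 5% per pass until the estimated k-space error meets the
   accuracy target, giving up after 500 tries; err() below is a
   hypothetical stand-in for compute_df_kspace()
------------------------------------------------------------------------- */

#include <stdexcept>

// shrink spacing h until err(h) <= accuracy, mirroring the ad/stagger
// branch of the grid-size search above
double refine_spacing(double h, double accuracy,
                      double (*err)(double), int maxiter = 500)
{
  for (int it = 0; it < maxiter; ++it) {
    if (err(h) <= accuracy) return h;
    h *= 0.95;                       // same 5% reduction as above
  }
  throw std::runtime_error("could not compute grid size");
}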
"Could not compute grid size"); + h *= 0.95; + h_x = h_y = h_z = h; + } + + } else { + + double err; + h_x = h_y = h_z = 1.0/g_ewald; + + nx_pppm = static_cast (xprd/h_x) + 1; + ny_pppm = static_cast (yprd/h_y) + 1; + nz_pppm = static_cast (zprd_slab/h_z) + 1; + + err = estimate_ik_error(h_x,xprd,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_x,xprd,natoms); + nx_pppm++; + h_x = xprd/nx_pppm; + } + + err = estimate_ik_error(h_y,yprd,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_y,yprd,natoms); + ny_pppm++; + h_y = yprd/ny_pppm; + } + + err = estimate_ik_error(h_z,zprd_slab,natoms); + while (err > accuracy) { + err = estimate_ik_error(h_z,zprd_slab,natoms); + nz_pppm++; + h_z = zprd_slab/nz_pppm; + } + } + + // scale grid for triclinic skew + + if (triclinic) { + double tmp[3]; + tmp[0] = nx_pppm/xprd; + tmp[1] = ny_pppm/yprd; + tmp[2] = nz_pppm/zprd; + lamda2xT(&tmp[0],&tmp[0]); + nx_pppm = static_cast(tmp[0]) + 1; + ny_pppm = static_cast(tmp[1]) + 1; + nz_pppm = static_cast(tmp[2]) + 1; + } + } + + // boost grid size until it is factorable + + while (!factorable(nx_pppm)) nx_pppm++; + while (!factorable(ny_pppm)) ny_pppm++; + while (!factorable(nz_pppm)) nz_pppm++; + + if (triclinic == 0) { + h_x = xprd/nx_pppm; + h_y = yprd/ny_pppm; + h_z = zprd_slab/nz_pppm; + } else { + double tmp[3]; + tmp[0] = nx_pppm; + tmp[1] = ny_pppm; + tmp[2] = nz_pppm; + x2lamdaT(&tmp[0],&tmp[0]); + h_x = 1.0/tmp[0]; + h_y = 1.0/tmp[1]; + h_z = 1.0/tmp[2]; + } + + if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET) + error->all(FLERR,"PPPM grid is too large"); +} + +/* ---------------------------------------------------------------------- + check if all factors of n are in list of factors + return 1 if yes, 0 if no +------------------------------------------------------------------------- */ + +int PPPM2::factorable(int n) +{ + int i; + + while (n > 1) { + for (i = 0; i < nfactors; i++) { + if (n % factors[i] == 0) { + n /= factors[i]; + break; + } + } + if (i == nfactors) return 0; + } + + return 1; +} + +/* ---------------------------------------------------------------------- + compute estimated kspace force error +------------------------------------------------------------------------- */ + +double PPPM2::compute_df_kspace() +{ + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + double zprd_slab = zprd*slab_volfactor; + bigint natoms = atom->natoms; + double df_kspace = 0.0; + if (differentiation_flag == 1 || stagger_flag) { + double qopt = compute_qopt(); + df_kspace = sqrt(qopt/natoms)*q2/(xprd*yprd*zprd_slab); + } else { + double lprx = estimate_ik_error(h_x,xprd,natoms); + double lpry = estimate_ik_error(h_y,yprd,natoms); + double lprz = estimate_ik_error(h_z,zprd_slab,natoms); + df_kspace = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0); + } + return df_kspace; +} + +/* ---------------------------------------------------------------------- + compute qopt +------------------------------------------------------------------------- */ + +double PPPM2::compute_qopt() +{ + int k,l,m,nx,ny,nz; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double u1,u2,sqk; + double sum1,sum2,sum3,sum4,dot2; + + double *prd = domain->prd; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]; + const double zprd_slab = zprd*slab_volfactor; + volume = xprd * yprd * zprd_slab; + + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = 
(MY_2PI/zprd_slab); + + const int twoorder = 2*order; + + // loop over entire FFT grid + // each proc calculates contributions from every Pth grid point + + bigint ngridtotal = (bigint) nx_pppm * ny_pppm * nz_pppm; + int nxy_pppm = nx_pppm * ny_pppm; + + double qopt = 0.0; + + for (bigint i = me; i < ngridtotal; i += nprocs) { + k = i % nx_pppm; + l = (i/nx_pppm) % ny_pppm; + m = i / nxy_pppm; + + const int kper = k - nx_pppm*(2*k/nx_pppm); + const int lper = l - ny_pppm*(2*l/ny_pppm); + const int mper = m - nz_pppm*(2*m/nz_pppm); + + sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + if (sqk == 0.0) continue; + + sum1 = sum2 = sum3 = sum4 = 0.0; + + for (nx = -2; nx <= 2; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); + qx *= qx; + + for (ny = -2; ny <= 2; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + qy *= qy; + + for (nz = -2; nz <= 2; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + qz *= qz; + + dot2 = qx+qy+qz; + u1 = sx*sy*sz; + u2 = wx*wy*wz; + + sum1 += u1*u1/dot2*MY_4PI*MY_4PI; + sum2 += u1 * u2 * MY_4PI; + sum3 += u2; + sum4 += dot2*u2; + } + } + } + + sum2 *= sum2; + qopt += sum1 - sum2/(sum3*sum4); + } + + // sum qopt over all procs + + double qopt_all; + MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world); + return qopt_all; +} + +/* ---------------------------------------------------------------------- + estimate kspace force error for ik method +------------------------------------------------------------------------- */ + +double PPPM2::estimate_ik_error(double h, double prd, bigint natoms) +{ + double sum = 0.0; + if (natoms == 0) return 0.0; + for (int m = 0; m < order; m++) + sum += acons[order][m] * pow(h*g_ewald,2.0*m); + double value = q2 * pow(h*g_ewald,(double)order) * + sqrt(g_ewald*prd*sqrt(MY_2PI)*sum/natoms) / (prd*prd); + + return value; +} + +/* ---------------------------------------------------------------------- + adjust the g_ewald parameter to near its optimal value + using a Newton-Raphson solver +------------------------------------------------------------------------- */ + +void PPPM2::adjust_gewald() +{ + double dx; + + for (int i = 0; i < LARGE; i++) { + dx = newton_raphson_f() / derivf(); + g_ewald -= dx; + if (fabs(newton_raphson_f()) < SMALL) return; + } + error->all(FLERR, "Could not compute g_ewald"); +} + +/* ---------------------------------------------------------------------- + calculate f(x) using Newton-Raphson solver +------------------------------------------------------------------------- */ + +double PPPM2::newton_raphson_f() +{ + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + bigint natoms = atom->natoms; + + double df_rspace = 2.0*q2*exp(-g_ewald*g_ewald*cutoff*cutoff) / + sqrt(natoms*cutoff*xprd*yprd*zprd); + + double df_kspace = compute_df_kspace(); + + return df_rspace - df_kspace; +} + +/* ---------------------------------------------------------------------- + calculate numerical derivative f'(x) using forward difference + [f(x + h) - f(x)] / h +------------------------------------------------------------------------- */ + +double PPPM2::derivf() +{ + double h = 0.000001; //Derivative step-size + double df,f1,f2,g_ewald_old; + + f1 = newton_raphson_f(); + g_ewald_old = 
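/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): adjust_gewald()
   above iterates g_ewald -= f/f' with f = (real-space error) -
   (k-space error) and f' taken from the forward difference in derivf(),
   which continues just below; the generic solver here uses hypothetical
   names, step size, and tolerance
------------------------------------------------------------------------- */

#include <cmath>
#include <functional>
#include <stdexcept>

// 1d Newton iteration with a forward-difference derivative,
// x <- x - f(x) / ((f(x + h) - f(x)) / h)
double newton_forward_difference(std::function<double(double)> f, double x,
                                 double fd_step = 1.0e-6,
                                 double tol = 1.0e-8, int maxiter = 100)
{
  for (int it = 0; it < maxiter; ++it) {
    double f0 = f(x);
    if (std::fabs(f0) < tol) return x;
    double dfdx = (f(x + fd_step) - f0) / fd_step;
    x -= f0 / dfdx;
  }
  throw std::runtime_error("could not converge");
}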
g_ewald; + g_ewald += h; + f2 = newton_raphson_f(); + g_ewald = g_ewald_old; + df = (f2 - f1)/h; + + return df; +} + +/* ---------------------------------------------------------------------- + calculate the final estimate of the accuracy +------------------------------------------------------------------------- */ + +double PPPM2::final_accuracy() +{ + double xprd = domain->xprd; + double yprd = domain->yprd; + double zprd = domain->zprd; + bigint natoms = atom->natoms; + if (natoms == 0) natoms = 1; // avoid division by zero + + double df_kspace = compute_df_kspace(); + double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd); + double df_rspace = 2.0 * q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff); + double df_table = estimate_table_accuracy(q2_over_sqrt,df_rspace); + double estimated_accuracy = sqrt(df_kspace*df_kspace + df_rspace*df_rspace + + df_table*df_table); + + return estimated_accuracy; +} + +/* ---------------------------------------------------------------------- + set local subset of PPPM/FFT grid that I own + n xyz lo/hi in = 3d brick that I own (inclusive) + n xyz lo/hi out = 3d brick + ghost cells in 6 directions (inclusive) + n xyz lo/hi fft = FFT columns that I own (all of x dim, 2d decomp in yz) +------------------------------------------------------------------------- */ + +void PPPM2::set_grid_local() +{ + // global indices of PPPM grid range from 0 to N-1 + // nlo_in,nhi_in = lower/upper limits of the 3d sub-brick of + // global PPPM grid that I own without ghost cells + // for slab PPPM, assign z grid as if it were not extended + // both non-tiled and tiled proc layouts use 0-1 fractional sumdomain info + + if (comm->layout != Comm::LAYOUT_TILED) { + nxlo_in = static_cast (comm->xsplit[comm->myloc[0]] * nx_pppm); + nxhi_in = static_cast (comm->xsplit[comm->myloc[0]+1] * nx_pppm) - 1; + + nylo_in = static_cast (comm->ysplit[comm->myloc[1]] * ny_pppm); + nyhi_in = static_cast (comm->ysplit[comm->myloc[1]+1] * ny_pppm) - 1; + + nzlo_in = static_cast + (comm->zsplit[comm->myloc[2]] * nz_pppm/slab_volfactor); + nzhi_in = static_cast + (comm->zsplit[comm->myloc[2]+1] * nz_pppm/slab_volfactor) - 1; + + } else { + nxlo_in = static_cast (comm->mysplit[0][0] * nx_pppm); + nxhi_in = static_cast (comm->mysplit[0][1] * nx_pppm) - 1; + + nylo_in = static_cast (comm->mysplit[1][0] * ny_pppm); + nyhi_in = static_cast (comm->mysplit[1][1] * ny_pppm) - 1; + + nzlo_in = static_cast (comm->mysplit[2][0] * nz_pppm/slab_volfactor); + nzhi_in = static_cast (comm->mysplit[2][1] * nz_pppm/slab_volfactor) - 1; + } + + // nlower,nupper = stencil size for mapping particles to PPPM grid + + nlower = -(order-1)/2; + nupper = order/2; + + // shift values for particle <-> grid mapping + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + if (order % 2) shift = OFFSET + 0.5; + else shift = OFFSET; + if (order % 2) shiftone = 0.0; + else shiftone = 0.5; + + // nlo_out,nhi_out = lower/upper limits of the 3d sub-brick of + // global PPPM grid that my particles can contribute charge to + // effectively nlo_in,nhi_in + ghost cells + // nlo,nhi = global coords of grid pt to "lower left" of smallest/largest + // position a particle in my box can be at + // dist[3] = particle position bound = subbox + skin/2.0 + qdist + // qdist = offset due to TIP4P fictitious charge + // convert to triclinic if necessary + // nlo_out,nhi_out = nlo,nhi + stencil size for particle mapping + // for slab PPPM, assign z grid as if it were not extended + + double *prd,*sublo,*subhi; + + if 
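/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): set_grid_local()
   above converts each processor's fractional sub-domain bounds into an
   inclusive range of owned grid planes; adjacent ranges tile 0..N-1
   without gaps or overlap because both ends use the same truncation.
   Names below are hypothetical
------------------------------------------------------------------------- */

#include <utility>

// owned grid planes for a sub-domain spanning the fractional interval
// [lo_frac,hi_frac) of an n-point grid, matching the
// static_cast<int>(split * n) ... - 1 pattern above
std::pair<int,int> owned_range(double lo_frac, double hi_frac, int n)
{
  int lo = static_cast<int>(lo_frac * n);
  int hi = static_cast<int>(hi_frac * n) - 1;   // inclusive upper bound
  return std::make_pair(lo, hi);
}
// e.g. n = 10 with splits 0.0 / 0.35 / 1.0 gives ranges 0..2 and 3..9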
(triclinic == 0) { + prd = domain->prd; + boxlo = domain->boxlo; + sublo = domain->sublo; + subhi = domain->subhi; + } else { + prd = domain->prd_lamda; + boxlo = domain->boxlo_lamda; + sublo = domain->sublo_lamda; + subhi = domain->subhi_lamda; + } + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + double zprd_slab = zprd*slab_volfactor; + + double dist[3] = {0.0,0.0,0.0}; + double cuthalf = 0.5*neighbor->skin + qdist; + if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf; + else kspacebbox(cuthalf,&dist[0]); + + int nlo,nhi; + nlo = nhi = 0; + + nlo = static_cast ((sublo[0]-dist[0]-boxlo[0]) * + nx_pppm/xprd + shift) - OFFSET; + nhi = static_cast ((subhi[0]+dist[0]-boxlo[0]) * + nx_pppm/xprd + shift) - OFFSET; + nxlo_out = nlo + nlower; + nxhi_out = nhi + nupper; + + nlo = static_cast ((sublo[1]-dist[1]-boxlo[1]) * + ny_pppm/yprd + shift) - OFFSET; + nhi = static_cast ((subhi[1]+dist[1]-boxlo[1]) * + ny_pppm/yprd + shift) - OFFSET; + nylo_out = nlo + nlower; + nyhi_out = nhi + nupper; + + nlo = static_cast ((sublo[2]-dist[2]-boxlo[2]) * + nz_pppm/zprd_slab + shift) - OFFSET; + nhi = static_cast ((subhi[2]+dist[2]-boxlo[2]) * + nz_pppm/zprd_slab + shift) - OFFSET; + nzlo_out = nlo + nlower; + nzhi_out = nhi + nupper; + + if (stagger_flag) { + nxhi_out++; + nyhi_out++; + nzhi_out++; + } + + // for slab PPPM, change the grid boundary for processors at +z end + // to include the empty volume between periodically repeating slabs + // for slab PPPM, want charge data communicated from -z proc to +z proc, + // but not vice versa, also want field data communicated from +z proc to + // -z proc, but not vice versa + // this is accomplished by nzhi_in = nzhi_out on +z end (no ghost cells) + // also insure no other procs use ghost cells beyond +z limit + // differnet logic for non-tiled vs tiled decomposition + + if (slabflag == 1) { + if (comm->layout != Comm::LAYOUT_TILED) { + if (comm->myloc[2] == comm->procgrid[2]-1) nzhi_in = nzhi_out = nz_pppm - 1; + } else { + if (comm->mysplit[2][1] == 1.0) nzhi_in = nzhi_out = nz_pppm - 1; + } + nzhi_out = MIN(nzhi_out,nz_pppm-1); + } + + // x-pencil decomposition of FFT mesh + // global indices range from 0 to N-1 + // each proc owns entire x-dimension, clumps of columns in y,z dimensions + // npey_fft,npez_fft = # of procs in y,z dims + // if nprocs is small enough, proc can own 1 or more entire xy planes, + // else proc owns 2d sub-blocks of yz plane + // me_y,me_z = which proc (0-npe_fft-1) I am in y,z dimensions + // nlo_fft,nhi_fft = lower/upper limit of the section + // of the global FFT mesh that I own in x-pencil decomposition + + int npey_fft,npez_fft; + if (nz_pppm >= nprocs) { + npey_fft = 1; + npez_fft = nprocs; + } else procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft); + + int me_y = me % npey_fft; + int me_z = me / npey_fft; + + nxlo_fft = 0; + nxhi_fft = nx_pppm - 1; + nylo_fft = me_y*ny_pppm/npey_fft; + nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1; + nzlo_fft = me_z*nz_pppm/npez_fft; + nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1; + + // ngrid = count of PPPM grid pts owned by this proc, including ghosts + + ngrid = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + + // count of FFT grids pts owned by this proc, without ghosts + // nfft = FFT points in x-pencil FFT decomposition on this proc + // nfft_brick = FFT points in 3d brick-decomposition on this proc + // nfft_both = greater of 2 values + + nfft = (nxhi_fft-nxlo_fft+1) * (nyhi_fft-nylo_fft+1) * + (nzhi_fft-nzlo_fft+1); + int 
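/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): the
   static_cast<int>(x + shift) - OFFSET pattern used just above (and in
   particle_map() below) exists because integer conversion truncates
   toward zero, so int(-0.75) would give 0 where the stencil needs -1;
   adding a large positive offset first makes truncation act like floor.
   The value 16384 below is only illustrative -- OFFSET is defined
   elsewhere in pppm2.cpp
------------------------------------------------------------------------- */

#include <cstdio>

static const int kOffset = 16384;      // any integer larger than |x| works

static int trunc_to_int(double x)     { return static_cast<int>(x); }
static int floor_via_offset(double x) { return static_cast<int>(x + kOffset) - kOffset; }

int main()
{
  std::printf("%d %d\n", trunc_to_int(-0.75), floor_via_offset(-0.75));  // 0 -1
  return 0;
}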
nfft_brick = (nxhi_in-nxlo_in+1) * (nyhi_in-nylo_in+1) * + (nzhi_in-nzlo_in+1); + nfft_both = MAX(nfft,nfft_brick); +} + +/* ---------------------------------------------------------------------- + pre-compute Green's function denominator expansion coeffs, Gamma(2n) +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_denom() +{ + int k,l,m; + + for (l = 1; l < order; l++) gf_b[l] = 0.0; + gf_b[0] = 1.0; + + for (m = 1; m < order; m++) { + for (l = m; l > 0; l--) + gf_b[l] = 4.0 * (gf_b[l]*(l-m)*(l-m-0.5)-gf_b[l-1]*(l-m-1)*(l-m-1)); + gf_b[0] = 4.0 * (gf_b[0]*(l-m)*(l-m-0.5)); + } + + bigint ifact = 1; + for (k = 1; k < 2*order; k++) ifact *= k; + double gaminv = 1.0/ifact; + for (l = 0; l < order; l++) gf_b[l] *= gaminv; +} + +/* ---------------------------------------------------------------------- + pre-compute modified (Hockney-Eastwood) Coulomb Green's function +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_ik() +{ + const double * const prd = domain->prd; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]; + const double zprd_slab = zprd*slab_volfactor; + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = (MY_2PI/zprd_slab); + + double snx,sny,snz; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,dot1,dot2; + double numerator,denominator; + double sqk; + + int k,l,m,n,nx,ny,nz,kper,lper,mper; + + const int nbx = static_cast ((g_ewald*xprd/(MY_PI*nx_pppm)) * + pow(-log(EPS_HOC),0.25)); + const int nby = static_cast ((g_ewald*yprd/(MY_PI*ny_pppm)) * + pow(-log(EPS_HOC),0.25)); + const int nbz = static_cast ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * + pow(-log(EPS_HOC),0.25)); + const int twoorder = 2*order; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm)); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + sny = square(sin(0.5*unitky*lper*yprd/ny_pppm)); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm)); + + sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); + + if (sqk != 0.0) { + numerator = 12.5663706/sqk; + denominator = gf_denom(snx,sny,snz); + sum1 = 0.0; + + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); + + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; + } + } + } + greensfn[n++] = numerator*sum1/denominator; + } else greensfn[n++] = 0.0; + } + } + } +} + +/* ---------------------------------------------------------------------- + pre-compute modified (Hockney-Eastwood) Coulomb Green's function + for a triclinic system +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_ik_triclinic() +{ + double snx,sny,snz; + double 
argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double sum1,dot1,dot2; + double numerator,denominator; + double sqk; + + int k,l,m,n,nx,ny,nz,kper,lper,mper; + + double tmp[3]; + tmp[0] = (g_ewald/(MY_PI*nx_pppm)) * pow(-log(EPS_HOC),0.25); + tmp[1] = (g_ewald/(MY_PI*ny_pppm)) * pow(-log(EPS_HOC),0.25); + tmp[2] = (g_ewald/(MY_PI*nz_pppm)) * pow(-log(EPS_HOC),0.25); + lamda2xT(&tmp[0],&tmp[0]); + const int nbx = static_cast (tmp[0]); + const int nby = static_cast (tmp[1]); + const int nbz = static_cast (tmp[2]); + + const int twoorder = 2*order; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + snz = square(sin(MY_PI*mper/nz_pppm)); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + sny = square(sin(MY_PI*lper/ny_pppm)); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + snx = square(sin(MY_PI*kper/nx_pppm)); + + double unitk_lamda[3]; + unitk_lamda[0] = 2.0*MY_PI*kper; + unitk_lamda[1] = 2.0*MY_PI*lper; + unitk_lamda[2] = 2.0*MY_PI*mper; + x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]); + + sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]); + + if (sqk != 0.0) { + numerator = 12.5663706/sqk; + denominator = gf_denom(snx,sny,snz); + sum1 = 0.0; + + for (nx = -nbx; nx <= nbx; nx++) { + argx = MY_PI*kper/nx_pppm + MY_PI*nx; + wx = powsinxx(argx,twoorder); + + for (ny = -nby; ny <= nby; ny++) { + argy = MY_PI*lper/ny_pppm + MY_PI*ny; + wy = powsinxx(argy,twoorder); + + for (nz = -nbz; nz <= nbz; nz++) { + argz = MY_PI*mper/nz_pppm + MY_PI*nz; + wz = powsinxx(argz,twoorder); + + double b[3]; + b[0] = 2.0*MY_PI*nx_pppm*nx; + b[1] = 2.0*MY_PI*ny_pppm*ny; + b[2] = 2.0*MY_PI*nz_pppm*nz; + x2lamdaT(&b[0],&b[0]); + + qx = unitk_lamda[0]+b[0]; + sx = exp(-0.25*square(qx/g_ewald)); + + qy = unitk_lamda[1]+b[1]; + sy = exp(-0.25*square(qy/g_ewald)); + + qz = unitk_lamda[2]+b[2]; + sz = exp(-0.25*square(qz/g_ewald)); + + dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; + } + } + } + greensfn[n++] = numerator*sum1/denominator; + } else greensfn[n++] = 0.0; + } + } + } +} + +/* ---------------------------------------------------------------------- + compute optimized Green's function for energy calculation +------------------------------------------------------------------------- */ + +void PPPM2::compute_gf_ad() +{ + const double * const prd = domain->prd; + + const double xprd = prd[0]; + const double yprd = prd[1]; + const double zprd = prd[2]; + const double zprd_slab = zprd*slab_volfactor; + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = (MY_2PI/zprd_slab); + + double snx,sny,snz,sqk; + double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; + double numerator,denominator; + int k,l,m,n,kper,lper,mper; + + const int twoorder = 2*order; + + for (int i = 0; i < 6; i++) sf_coeff[i] = 0.0; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + qz = unitkz*mper; + snz = square(sin(0.5*qz*zprd_slab/nz_pppm)); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + qy = unitky*lper; + sny = square(sin(0.5*qy*yprd/ny_pppm)); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - 
nx_pppm*(2*k/nx_pppm); + qx = unitkx*kper; + snx = square(sin(0.5*qx*xprd/nx_pppm)); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); + + sqk = qx*qx + qy*qy + qz*qz; + + if (sqk != 0.0) { + numerator = MY_4PI/sqk; + denominator = gf_denom(snx,sny,snz); + greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator; + sf_coeff[0] += sf_precoeff1[n]*greensfn[n]; + sf_coeff[1] += sf_precoeff2[n]*greensfn[n]; + sf_coeff[2] += sf_precoeff3[n]*greensfn[n]; + sf_coeff[3] += sf_precoeff4[n]*greensfn[n]; + sf_coeff[4] += sf_precoeff5[n]*greensfn[n]; + sf_coeff[5] += sf_precoeff6[n]*greensfn[n]; + n++; + } else { + greensfn[n] = 0.0; + sf_coeff[0] += sf_precoeff1[n]*greensfn[n]; + sf_coeff[1] += sf_precoeff2[n]*greensfn[n]; + sf_coeff[2] += sf_precoeff3[n]*greensfn[n]; + sf_coeff[3] += sf_precoeff4[n]*greensfn[n]; + sf_coeff[4] += sf_precoeff5[n]*greensfn[n]; + sf_coeff[5] += sf_precoeff6[n]*greensfn[n]; + n++; + } + } + } + } + + // compute the coefficients for the self-force correction + + double prex, prey, prez; + prex = prey = prez = MY_PI/volume; + prex *= nx_pppm/xprd; + prey *= ny_pppm/yprd; + prez *= nz_pppm/zprd_slab; + sf_coeff[0] *= prex; + sf_coeff[1] *= prex*2; + sf_coeff[2] *= prey; + sf_coeff[3] *= prey*2; + sf_coeff[4] *= prez; + sf_coeff[5] *= prez*2; + + // communicate values with other procs + + double tmp[6]; + MPI_Allreduce(sf_coeff,tmp,6,MPI_DOUBLE,MPI_SUM,world); + for (n = 0; n < 6; n++) sf_coeff[n] = tmp[n]; +} + +/* ---------------------------------------------------------------------- + compute self force coefficients for ad-differentiation scheme +------------------------------------------------------------------------- */ + +void PPPM2::compute_sf_precoeff() +{ + int i,k,l,m,n; + int nx,ny,nz,kper,lper,mper; + double wx0[5],wy0[5],wz0[5],wx1[5],wy1[5],wz1[5],wx2[5],wy2[5],wz2[5]; + double qx0,qy0,qz0,qx1,qy1,qz1,qx2,qy2,qz2; + double u0,u1,u2,u3,u4,u5,u6; + double sum1,sum2,sum3,sum4,sum5,sum6; + + n = 0; + for (m = nzlo_fft; m <= nzhi_fft; m++) { + mper = m - nz_pppm*(2*m/nz_pppm); + + for (l = nylo_fft; l <= nyhi_fft; l++) { + lper = l - ny_pppm*(2*l/ny_pppm); + + for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + + sum1 = sum2 = sum3 = sum4 = sum5 = sum6 = 0.0; + for (i = 0; i < 5; i++) { + + qx0 = MY_2PI*(kper+nx_pppm*(i-2)); + qx1 = MY_2PI*(kper+nx_pppm*(i-1)); + qx2 = MY_2PI*(kper+nx_pppm*(i )); + wx0[i] = powsinxx(0.5*qx0/nx_pppm,order); + wx1[i] = powsinxx(0.5*qx1/nx_pppm,order); + wx2[i] = powsinxx(0.5*qx2/nx_pppm,order); + + qy0 = MY_2PI*(lper+ny_pppm*(i-2)); + qy1 = MY_2PI*(lper+ny_pppm*(i-1)); + qy2 = MY_2PI*(lper+ny_pppm*(i )); + wy0[i] = powsinxx(0.5*qy0/ny_pppm,order); + wy1[i] = powsinxx(0.5*qy1/ny_pppm,order); + wy2[i] = powsinxx(0.5*qy2/ny_pppm,order); + + qz0 = MY_2PI*(mper+nz_pppm*(i-2)); + qz1 = MY_2PI*(mper+nz_pppm*(i-1)); + qz2 = MY_2PI*(mper+nz_pppm*(i )); + + wz0[i] = powsinxx(0.5*qz0/nz_pppm,order); + wz1[i] = powsinxx(0.5*qz1/nz_pppm,order); + wz2[i] = powsinxx(0.5*qz2/nz_pppm,order); + } + + for (nx = 0; nx < 5; nx++) { + for (ny = 0; ny < 5; ny++) { + for (nz = 0; nz < 5; nz++) { + u0 = wx0[nx]*wy0[ny]*wz0[nz]; + u1 = wx1[nx]*wy0[ny]*wz0[nz]; + u2 = wx2[nx]*wy0[ny]*wz0[nz]; + u3 = wx0[nx]*wy1[ny]*wz0[nz]; + u4 = wx0[nx]*wy2[ny]*wz0[nz]; + u5 = wx0[nx]*wy0[ny]*wz1[nz]; + u6 = wx0[nx]*wy0[ny]*wz2[nz]; + + sum1 += u0*u1; + sum2 += u0*u2; + sum3 += u0*u3; + sum4 += u0*u4; + sum5 += u0*u5; + sum6 += u0*u6; + } + } + } + + // store values + + sf_precoeff1[n] = sum1; + 
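/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): the wx/wy/wz
   factors in the Green's-function and self-force routines above come
   from powsinxx(), a LAMMPS math_special helper; assuming it evaluates
   (sin(x)/x)^n with the x -> 0 limit equal to 1, a plain version looks
   like the function below.  With x = 0.5*q*h this is the spectrum
   factor of the order-n charge-assignment stencil
------------------------------------------------------------------------- */

#include <cmath>

// unoptimized equivalent of powsinxx(x,n), under the assumption stated above
double pow_sinx_over_x(double x, int n)
{
  if (x == 0.0) return 1.0;
  return std::pow(std::sin(x)/x, n);
}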
sf_precoeff2[n] = sum2; + sf_precoeff3[n] = sum3; + sf_precoeff4[n] = sum4; + sf_precoeff5[n] = sum5; + sf_precoeff6[n++] = sum6; + } + } + } +} + +/* ---------------------------------------------------------------------- + find center grid pt for each of my particles + check that full stencil for the particle will fit in my 3d brick + store central grid pt indices in part2grid array +------------------------------------------------------------------------- */ + +void PPPM2::particle_map() +{ + int nx,ny,nz; + + double **x = atom->x; + int nlocal = atom->nlocal; + + int flag = 0; + + if (!std::isfinite(boxlo[0]) || !std::isfinite(boxlo[1]) || !std::isfinite(boxlo[2])) + error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); + + for (int i = 0; i < nlocal; i++) { + + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // current particle coord can be outside global and local box + // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 + + nx = static_cast ((x[i][0]-boxlo[0])*delxinv+shift) - OFFSET; + ny = static_cast ((x[i][1]-boxlo[1])*delyinv+shift) - OFFSET; + nz = static_cast ((x[i][2]-boxlo[2])*delzinv+shift) - OFFSET; + + part2grid[i][0] = nx; + part2grid[i][1] = ny; + part2grid[i][2] = nz; + + // check that entire stencil around nx,ny,nz will fit in my 3d brick + + if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || + ny+nlower < nylo_out || ny+nupper > nyhi_out || + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + flag = 1; + } + + if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM"); +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid +------------------------------------------------------------------------- */ + +void PPPM2::make_rho() +{ + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + + // clear 3d density array + + memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + double *q = atom->q; + double **x = atom->x; + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) { + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + z0 = delvolinv * q[i]; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + y0 = z0*rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + x0 = y0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + density_brick[mz][my][mx] += x0*rho1d[0][l]; + } + } + } + } +} + +/* ---------------------------------------------------------------------- + remap density from 3d brick decomposition to FFT decomposition +------------------------------------------------------------------------- */ + +void PPPM2::brick2fft() +{ + int n,ix,iy,iz; + + // copy grabs inner portion of density from 3d brick + // remap could be done as pre-stage of FFT, + // but this works optimally on only double values, not complex values + + n = 0; + for 
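/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): make_rho() above
   spreads each charge onto an order^3 block of grid points using the
   product of three 1-d weights, so no 3-d stencil is ever stored.  The
   dense-grid version below uses hypothetical names, assumes order <= 5
   only to keep the arrays fixed-size, and omits the ghost handling of
   the real code
------------------------------------------------------------------------- */

#include <vector>

// separable spreading of one charge: weight(l,m,n) = w[0][l]*w[1][m]*w[2][n]
void spread_charge(std::vector<double> &grid, int nx, int ny,
                   int ix, int iy, int iz,           // "lower left" grid point
                   const double w1d[3][5], int order,
                   double q_over_dV)
{
  for (int n = 0; n < order; n++)
    for (int m = 0; m < order; m++)
      for (int l = 0; l < order; l++)
        grid[((iz+n)*ny + (iy+m))*nx + (ix+l)] +=
          q_over_dV * w1d[0][l]*w1d[1][m]*w1d[2][n];
}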
(iz = nzlo_in; iz <= nzhi_in; iz++) + for (iy = nylo_in; iy <= nyhi_in; iy++) + for (ix = nxlo_in; ix <= nxhi_in; ix++) + density_fft[n++] = density_brick[iz][iy][ix]; + + remap->perform(density_fft,density_fft,work1); +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver +------------------------------------------------------------------------- */ + +void PPPM2::poisson() +{ + if (differentiation_flag == 1) poisson_ad(); + else poisson_ik(); +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for ik +------------------------------------------------------------------------- */ + +void PPPM2::poisson_ik() +{ + int i,j,k,n; + double eng; + + // transform charge density (r -> k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] = density_fft[i]; + work1[n++] = ZEROF; + } + + fft1->compute(work1,work1,1); + + // global energy and virial contribution + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + if (eflag_global || vflag_global) { + if (vflag_global) { + n = 0; + for (i = 0; i < nfft; i++) { + eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; + if (eflag_global) energy += eng; + n += 2; + } + } else { + n = 0; + for (i = 0; i < nfft; i++) { + energy += + s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + n += 2; + } + } + } + + // scale by 1/total-grid-pts to get rho(k) + // multiply by Green's function to get V(k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] *= scaleinv * greensfn[i]; + work1[n++] *= scaleinv * greensfn[i]; + } + + // extra FFTs for per-atom energy/virial + + if (evflag_atom) poisson_peratom(); + + // triclinic system + + if (triclinic) { + poisson_ik_triclinic(); + return; + } + + // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // FFT leaves data in 3d brick decomposition + // copy it into inner portion of vdx,vdy,vdz arrays + + // x direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkx[i]*work1[n+1]; + work2[n+1] = -fkx[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdx_brick[k][j][i] = work2[n]; + n += 2; + } + + // y direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fky[j]*work1[n+1]; + work2[n+1] = -fky[j]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdy_brick[k][j][i] = work2[n]; + n += 2; + } + + // z direction gradient + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + work2[n] = fkz[k]*work1[n+1]; + work2[n+1] = -fkz[k]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdz_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for ik for a triclinic system 
+------------------------------------------------------------------------- */ + +void PPPM2::poisson_ik_triclinic() +{ + int i,j,k,n; + + // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // FFT leaves data in 3d brick decomposition + // copy it into inner portion of vdx,vdy,vdz arrays + + // x direction gradient + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = fkx[i]*work1[n+1]; + work2[n+1] = -fkx[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdx_brick[k][j][i] = work2[n]; + n += 2; + } + + // y direction gradient + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = fky[i]*work1[n+1]; + work2[n+1] = -fky[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdy_brick[k][j][i] = work2[n]; + n += 2; + } + + // z direction gradient + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = fkz[i]*work1[n+1]; + work2[n+1] = -fkz[i]*work1[n]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + vdz_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for ad +------------------------------------------------------------------------- */ + +void PPPM2::poisson_ad() +{ + int i,j,k,n; + double eng; + + // transform charge density (r -> k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] = density_fft[i]; + work1[n++] = ZEROF; + } + + fft1->compute(work1,work1,1); + + // global energy and virial contribution + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + if (eflag_global || vflag_global) { + if (vflag_global) { + n = 0; + for (i = 0; i < nfft; i++) { + eng = s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + for (j = 0; j < 6; j++) virial[j] += eng*vg[i][j]; + if (eflag_global) energy += eng; + n += 2; + } + } else { + n = 0; + for (i = 0; i < nfft; i++) { + energy += + s2 * greensfn[i] * (work1[n]*work1[n] + work1[n+1]*work1[n+1]); + n += 2; + } + } + } + + // scale by 1/total-grid-pts to get rho(k) + // multiply by Green's function to get V(k) + + n = 0; + for (i = 0; i < nfft; i++) { + work1[n++] *= scaleinv * greensfn[i]; + work1[n++] *= scaleinv * greensfn[i]; + } + + // extra FFTs for per-atom energy/virial + + if (vflag_atom) poisson_peratom(); + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]; + work2[n+1] = work1[n+1]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + u_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for per-atom energy/virial +------------------------------------------------------------------------- */ + +void PPPM2::poisson_peratom() +{ + int i,j,k,n; + + // energy + + if (eflag_atom && differentiation_flag != 1) { + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]; + work2[n+1] = work1[n+1]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + 
for (i = nxlo_in; i <= nxhi_in; i++) { + u_brick[k][j][i] = work2[n]; + n += 2; + } + } + + // 6 components of virial in v0 thru v5 + + if (!vflag_atom) return; + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][0]; + work2[n+1] = work1[n+1]*vg[i][0]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v0_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][1]; + work2[n+1] = work1[n+1]*vg[i][1]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v1_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][2]; + work2[n+1] = work1[n+1]*vg[i][2]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v2_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][3]; + work2[n+1] = work1[n+1]*vg[i][3]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v3_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][4]; + work2[n+1] = work1[n+1]*vg[i][4]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v4_brick[k][j][i] = work2[n]; + n += 2; + } + + n = 0; + for (i = 0; i < nfft; i++) { + work2[n] = work1[n]*vg[i][5]; + work2[n+1] = work1[n+1]*vg[i][5]; + n += 2; + } + + fft2->compute(work2,work2,-1); + + n = 0; + for (k = nzlo_in; k <= nzhi_in; k++) + for (j = nylo_in; j <= nyhi_in; j++) + for (i = nxlo_in; i <= nxhi_in; i++) { + v5_brick[k][j][i] = work2[n]; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce() +{ + if (differentiation_flag == 1) fieldforce_ad(); + else fieldforce_ik(); +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles for ik +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce_ik() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR ekx,eky,ekz; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + 
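/* ----------------------------------------------------------------------
   illustrative sketch (editorial, not from pppm2.cpp): the interpolation
   loop continuing below gathers grid values back to the particle with
   the same 1-d weights used for spreading in make_rho(), i.e.
   interpolation is the transpose of charge assignment; fieldforce_ik()
   accumulates ekx/eky/ekz with a minus sign and then applies
   f += qqrd2e*scale*q[i]*ek.  Names below are hypothetical and mirror
   the spreading sketch above
------------------------------------------------------------------------- */

#include <vector>

// gather: interpolate a grid field to the particle with separable weights
double gather_field(const std::vector<double> &grid, int nx, int ny,
                    int ix, int iy, int iz,
                    const double w1d[3][5], int order)
{
  double val = 0.0;
  for (int n = 0; n < order; n++)
    for (int m = 0; m < order; m++)
      for (int l = 0; l < order; l++)
        val += w1d[0][l]*w1d[1][m]*w1d[2][n] *
               grid[((iz+n)*ny + (iy+m))*nx + (ix+l)];
  return val;
}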
ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*rho1d[0][l]; + ekx -= x0*vdx_brick[mz][my][mx]; + eky -= x0*vdy_brick[mz][my][mx]; + ekz -= x0*vdz_brick[mz][my][mx]; + } + } + } + + // convert E-field to force + + const double qfactor = qqrd2e * scale * q[i]; + f[i][0] += qfactor*ekx; + f[i][1] += qfactor*eky; + if (slabflag != 2) f[i][2] += qfactor*ekz; + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get electric field & force on my particles for ad +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce_ad() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz; + FFT_SCALAR ekx,eky,ekz; + double s1,s2,s3; + double sf = 0.0; + double *prd; + + prd = domain->prd; + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + + double hx_inv = nx_pppm/xprd; + double hy_inv = ny_pppm/yprd; + double hz_inv = nz_pppm/zprd; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + + int nlocal = atom->nlocal; + + for (i = 0; i < nlocal; i++) { + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + compute_drho1d(dx,dy,dz); + + ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + ekx += drho1d[0][l]*rho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx]; + eky += rho1d[0][l]*drho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx]; + ekz += rho1d[0][l]*rho1d[1][m]*drho1d[2][n]*u_brick[mz][my][mx]; + } + } + } + ekx *= hx_inv; + eky *= hy_inv; + ekz *= hz_inv; + + // convert E-field to force and subtract self forces + + const double qfactor = qqrd2e * scale; + + s1 = x[i][0]*hx_inv; + s2 = x[i][1]*hy_inv; + s3 = x[i][2]*hz_inv; + sf = sf_coeff[0]*sin(2*MY_PI*s1); + sf += sf_coeff[1]*sin(4*MY_PI*s1); + sf *= 2*q[i]*q[i]; + f[i][0] += qfactor*(ekx*q[i] - sf); + + sf = sf_coeff[2]*sin(2*MY_PI*s2); + sf += sf_coeff[3]*sin(4*MY_PI*s2); + sf *= 2*q[i]*q[i]; + f[i][1] += qfactor*(eky*q[i] - sf); + + + sf = sf_coeff[4]*sin(2*MY_PI*s3); + sf += sf_coeff[5]*sin(4*MY_PI*s3); + sf *= 2*q[i]*q[i]; + if (slabflag != 2) f[i][2] += qfactor*(ekz*q[i] - sf); + } +} + +/* ---------------------------------------------------------------------- + interpolate from grid to get per-atom energy/virial +------------------------------------------------------------------------- */ + +void PPPM2::fieldforce_peratom() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + FFT_SCALAR u,v0,v1,v2,v3,v4,v5; + + // loop over my charges, interpolate from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + double *q = atom->q; + double **x = atom->x; + + int nlocal 
= atom->nlocal; + + for (i = 0; i < nlocal; i++) { + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*rho1d[0][l]; + if (eflag_atom) u += x0*u_brick[mz][my][mx]; + if (vflag_atom) { + v0 += x0*v0_brick[mz][my][mx]; + v1 += x0*v1_brick[mz][my][mx]; + v2 += x0*v2_brick[mz][my][mx]; + v3 += x0*v3_brick[mz][my][mx]; + v4 += x0*v4_brick[mz][my][mx]; + v5 += x0*v5_brick[mz][my][mx]; + } + } + } + } + + if (eflag_atom) eatom[i] += q[i]*u; + if (vflag_atom) { + vatom[i][0] += q[i]*v0; + vatom[i][1] += q[i]*v1; + vatom[i][2] += q[i]*v2; + vatom[i][3] += q[i]*v3; + vatom[i][4] += q[i]*v4; + vatom[i][5] += q[i]*v5; + } + } +} + +/* ---------------------------------------------------------------------- + pack own values to buf to send to another proc +------------------------------------------------------------------------- */ + +void PPPM2::pack_forward2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + int n = 0; + + if (flag == FORWARD_IK) { + FFT_SCALAR *xsrc = &vdx_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *ysrc = &vdy_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *zsrc = &vdz_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + buf[n++] = xsrc[list[i]]; + buf[n++] = ysrc[list[i]]; + buf[n++] = zsrc[list[i]]; + } + } else if (flag == FORWARD_AD) { + FFT_SCALAR *src = &u_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + buf[i] = src[list[i]]; + } else if (flag == FORWARD_IK_PERATOM) { + FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + if (eflag_atom) buf[n++] = esrc[list[i]]; + if (vflag_atom) { + buf[n++] = v0src[list[i]]; + buf[n++] = v1src[list[i]]; + buf[n++] = v2src[list[i]]; + buf[n++] = v3src[list[i]]; + buf[n++] = v4src[list[i]]; + buf[n++] = v5src[list[i]]; + } + } + } else if (flag == FORWARD_AD_PERATOM) { + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + buf[n++] = v0src[list[i]]; + buf[n++] = v1src[list[i]]; + buf[n++] = v2src[list[i]]; + buf[n++] = v3src[list[i]]; + buf[n++] = v4src[list[i]]; + buf[n++] = v5src[list[i]]; + } + } +} + +/* ---------------------------------------------------------------------- + unpack another proc's own values from buf and set own ghost values +------------------------------------------------------------------------- */ + +void 
PPPM2::unpack_forward2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + int n = 0; + + if (flag == FORWARD_IK) { + FFT_SCALAR *xdest = &vdx_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *ydest = &vdy_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *zdest = &vdz_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + xdest[list[i]] = buf[n++]; + ydest[list[i]] = buf[n++]; + zdest[list[i]] = buf[n++]; + } + } else if (flag == FORWARD_AD) { + FFT_SCALAR *dest = &u_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + dest[list[i]] = buf[i]; + } else if (flag == FORWARD_IK_PERATOM) { + FFT_SCALAR *esrc = &u_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + if (eflag_atom) esrc[list[i]] = buf[n++]; + if (vflag_atom) { + v0src[list[i]] = buf[n++]; + v1src[list[i]] = buf[n++]; + v2src[list[i]] = buf[n++]; + v3src[list[i]] = buf[n++]; + v4src[list[i]] = buf[n++]; + v5src[list[i]] = buf[n++]; + } + } + } else if (flag == FORWARD_AD_PERATOM) { + FFT_SCALAR *v0src = &v0_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v1src = &v1_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v2src = &v2_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v3src = &v3_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v4src = &v4_brick[nzlo_out][nylo_out][nxlo_out]; + FFT_SCALAR *v5src = &v5_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) { + v0src[list[i]] = buf[n++]; + v1src[list[i]] = buf[n++]; + v2src[list[i]] = buf[n++]; + v3src[list[i]] = buf[n++]; + v4src[list[i]] = buf[n++]; + v5src[list[i]] = buf[n++]; + } + } +} + +/* ---------------------------------------------------------------------- + pack ghost values into buf to send to another proc +------------------------------------------------------------------------- */ + +void PPPM2::pack_reverse2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + if (flag == REVERSE_RHO) { + FFT_SCALAR *src = &density_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + buf[i] = src[list[i]]; + } +} + +/* ---------------------------------------------------------------------- + unpack another proc's ghost values from buf and add to own values +------------------------------------------------------------------------- */ + +void PPPM2::unpack_reverse2(int flag, void *pbuf, int nlist, int *list) +{ + FFT_SCALAR *buf = (FFT_SCALAR *) pbuf; + + if (flag == REVERSE_RHO) { + FFT_SCALAR *dest = &density_brick[nzlo_out][nylo_out][nxlo_out]; + for (int i = 0; i < nlist; i++) + dest[list[i]] += buf[i]; + } +} + +/* ---------------------------------------------------------------------- + map nprocs to NX by NY grid as PX by PY procs - return optimal px,py +------------------------------------------------------------------------- */ + +void PPPM2::procs2grid2d(int nprocs, int nx, int ny, int *px, int *py) +{ + // loop thru all possible factorizations of nprocs + // surf = surface area of largest proc sub-domain + // innermost if test minimizes surface area and surface/volume ratio + + int bestsurf = 2 * (nx + ny); + int 
bestboxx = 0; + int bestboxy = 0; + + int boxx,boxy,surf,ipx,ipy; + + ipx = 1; + while (ipx <= nprocs) { + if (nprocs % ipx == 0) { + ipy = nprocs/ipx; + boxx = nx/ipx; + if (nx % ipx) boxx++; + boxy = ny/ipy; + if (ny % ipy) boxy++; + surf = boxx + boxy; + if (surf < bestsurf || + (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) { + bestsurf = surf; + bestboxx = boxx; + bestboxy = boxy; + *px = ipx; + *py = ipy; + } + } + ipx++; + } +} + +/* ---------------------------------------------------------------------- + charge assignment into rho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ + +void PPPM2::compute_rho1d(const FFT_SCALAR &dx, const FFT_SCALAR &dy, + const FFT_SCALAR &dz) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-order)/2; k <= order/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = order-1; l >= 0; l--) { + r1 = rho_coeff[l][k] + r1*dx; + r2 = rho_coeff[l][k] + r2*dy; + r3 = rho_coeff[l][k] + r3*dz; + } + rho1d[0][k] = r1; + rho1d[1][k] = r2; + rho1d[2][k] = r3; + } +} + +/* ---------------------------------------------------------------------- + charge assignment into drho1d + dx,dy,dz = distance of particle from "lower left" grid point +------------------------------------------------------------------------- */ + +void PPPM2::compute_drho1d(const FFT_SCALAR &dx, const FFT_SCALAR &dy, + const FFT_SCALAR &dz) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-order)/2; k <= order/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = order-2; l >= 0; l--) { + r1 = drho_coeff[l][k] + r1*dx; + r2 = drho_coeff[l][k] + r2*dy; + r3 = drho_coeff[l][k] + r3*dz; + } + drho1d[0][k] = r1; + drho1d[1][k] = r2; + drho1d[2][k] = r3; + } +} + +/* ---------------------------------------------------------------------- + generate coeffients for the weight function of order n + + (n-1) + Wn(x) = Sum wn(k,x) , Sum is over every other integer + k=-(n-1) + For k=-(n-1),-(n-1)+2, ....., (n-1)-2,n-1 + k is odd integers if n is even and even integers if n is odd + --- + | n-1 + | Sum a(l,j)*(x-k/2)**l if abs(x-k/2) < 1/2 + wn(k,x) = < l=0 + | + | 0 otherwise + --- + a coeffients are packed into the array rho_coeff to eliminate zeros + rho_coeff(l,((k+mod(n+1,2))/2) = a(l,k) +------------------------------------------------------------------------- */ + +void PPPM2::compute_rho_coeff() +{ + int j,k,l,m; + FFT_SCALAR s; + + FFT_SCALAR **a; + memory->create2d_offset(a,order,-order,order,"pppm:a"); + + for (k = -order; k <= order; k++) + for (l = 0; l < order; l++) + a[l][k] = 0.0; + + a[0][0] = 1.0; + for (j = 1; j < order; j++) { + for (k = -j; k <= j; k += 2) { + s = 0.0; + for (l = 0; l < j; l++) { + a[l+1][k] = (a[l][k+1]-a[l][k-1]) / (l+1); +#ifdef FFT_SINGLE + s += powf(0.5,(float) l+1) * + (a[l][k-1] + powf(-1.0,(float) l) * a[l][k+1]) / (l+1); +#else + s += pow(0.5,(double) l+1) * + (a[l][k-1] + pow(-1.0,(double) l) * a[l][k+1]) / (l+1); +#endif + } + a[0][k] = s; + } + } + + m = (1-order)/2; + for (k = -(order-1); k < order; k += 2) { + for (l = 0; l < order; l++) + rho_coeff[l][m] = a[l][k]; + for (l = 1; l < order; l++) + drho_coeff[l-1][m] = l*a[l][k]; + m++; + } + + memory->destroy2d_offset(a,-order); +} + +/* ---------------------------------------------------------------------- + Slab-geometry correction term to dampen inter-slab interactions between + periodically repeating slabs. Yields good approximation to 2D Ewald if + adequate empty space is left between repeating slabs (J. Chem. 
Phys. + 111, 3155). Slabs defined here to be parallel to the xy plane. Also + extended to non-neutral systems (J. Chem. Phys. 131, 094107). +------------------------------------------------------------------------- */ + +void PPPM2::slabcorr() +{ + // compute local contribution to global dipole moment + + double *q = atom->q; + double **x = atom->x; + double zprd = domain->zprd; + int nlocal = atom->nlocal; + + double dipole = 0.0; + for (int i = 0; i < nlocal; i++) dipole += q[i]*x[i][2]; + + // sum local contributions to get global dipole moment + + double dipole_all; + MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world); + + // need to make non-neutral systems and/or + // per-atom energy translationally invariant + + double dipole_r2 = 0.0; + if (eflag_atom || fabs(qsum) > SMALL) { + for (int i = 0; i < nlocal; i++) + dipole_r2 += q[i]*x[i][2]*x[i][2]; + + // sum local contributions + + double tmp; + MPI_Allreduce(&dipole_r2,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_r2 = tmp; + } + + // compute corrections + + const double e_slabcorr = MY_2PI*(dipole_all*dipole_all - + qsum*dipole_r2 - qsum*qsum*zprd*zprd/12.0)/volume; + const double qscale = qqrd2e * scale; + + if (eflag_global) energy += qscale * e_slabcorr; + + // per-atom energy + + if (eflag_atom) { + double efact = qscale * MY_2PI/volume; + for (int i = 0; i < nlocal; i++) + eatom[i] += efact * q[i]*(x[i][2]*dipole_all - 0.5*(dipole_r2 + + qsum*x[i][2]*x[i][2]) - qsum*zprd*zprd/12.0); + } + + // add on force corrections + + double ffact = qscale * (-4.0*MY_PI/volume); + double **f = atom->f; + + for (int i = 0; i < nlocal; i++) f[i][2] += ffact * q[i]*(dipole_all - qsum*x[i][2]); +} + +/* ---------------------------------------------------------------------- + perform and time the 1d FFTs required for N timesteps +------------------------------------------------------------------------- */ + +int PPPM2::timing_1d(int n, double &time1d) +{ + double time1,time2; + + for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->timing1d(work1,nfft_both,1); + fft2->timing1d(work1,nfft_both,-1); + if (differentiation_flag != 1) { + fft2->timing1d(work1,nfft_both,-1); + fft2->timing1d(work1,nfft_both,-1); + } + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time1d = time2 - time1; + + if (differentiation_flag) return 2; + return 4; +} + +/* ---------------------------------------------------------------------- + perform and time the 3d FFTs required for N timesteps +------------------------------------------------------------------------- */ + +int PPPM2::timing_3d(int n, double &time3d) +{ + double time1,time2; + + for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF; + + MPI_Barrier(world); + time1 = MPI_Wtime(); + + for (int i = 0; i < n; i++) { + fft1->compute(work1,work1,1); + fft2->compute(work1,work1,-1); + if (differentiation_flag != 1) { + fft2->compute(work1,work1,-1); + fft2->compute(work1,work1,-1); + } + } + + MPI_Barrier(world); + time2 = MPI_Wtime(); + time3d = time2 - time1; + + if (differentiation_flag) return 2; + return 4; +} + +/* ---------------------------------------------------------------------- + memory usage of local arrays +------------------------------------------------------------------------- */ + +double PPPM2::memory_usage() +{ + double bytes = nmax*3 * sizeof(double); + + int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * + (nzhi_out-nzlo_out+1); + if (differentiation_flag == 1) { + bytes += 2 
* nbrick * sizeof(FFT_SCALAR); + } else { + bytes += 4 * nbrick * sizeof(FFT_SCALAR); + } + + if (triclinic) bytes += 3 * nfft_both * sizeof(double); + bytes += 6 * nfft_both * sizeof(double); + bytes += nfft_both * sizeof(double); + bytes += nfft_both*5 * sizeof(FFT_SCALAR); + + if (peratom_allocate_flag) + bytes += 6 * nbrick * sizeof(FFT_SCALAR); + + if (group_allocate_flag) { + bytes += 2 * nbrick * sizeof(FFT_SCALAR); + bytes += 2 * nfft_both * sizeof(FFT_SCALAR);; + } + + // two GridComm bufs + + bytes += (ngc_buf1 + ngc_buf2) * npergrid * sizeof(FFT_SCALAR); + + return bytes; +} + +/* ---------------------------------------------------------------------- + group-group interactions + ------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + compute the PPPM total long-range force and energy for groups A and B + ------------------------------------------------------------------------- */ + +void PPPM2::compute_group_group(int groupbit_A, int groupbit_B, int AA_flag) +{ + if (slabflag && triclinic) + error->all(FLERR,"Cannot (yet) use K-space slab " + "correction with compute group/group for triclinic systems"); + + if (differentiation_flag) + error->all(FLERR,"Cannot (yet) use kspace_modify " + "diff ad with compute group/group"); + + if (!group_allocate_flag) allocate_groups(); + + // convert atoms from box to lamda coords + + if (triclinic == 0) boxlo = domain->boxlo; + else { + boxlo = domain->boxlo_lamda; + domain->x2lamda(atom->nlocal); + } + + e2group = 0.0; //energy + f2group[0] = 0.0; //force in x-direction + f2group[1] = 0.0; //force in y-direction + f2group[2] = 0.0; //force in z-direction + + // map my particle charge onto my local 3d density grid + + make_rho_groups(groupbit_A,groupbit_B,AA_flag); + + // all procs communicate density values from their ghost cells + // to fully sum contribution in their 3d bricks + // remap from 3d decomposition to FFT decomposition + + // temporarily store and switch pointers so we can + // use brick2fft() for groups A and B (without + // writing an additional function) + + FFT_SCALAR ***density_brick_real = density_brick; + FFT_SCALAR *density_fft_real = density_fft; + + // group A + + density_brick = density_A_brick; + density_fft = density_A_fft; + + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + brick2fft(); + + // group B + + density_brick = density_B_brick; + density_fft = density_B_fft; + + gc->reverse_comm(this,1,sizeof(FFT_SCALAR),REVERSE_RHO, + gc_buf1,gc_buf2,MPI_FFT_SCALAR); + brick2fft(); + + // switch back pointers + + density_brick = density_brick_real; + density_fft = density_fft_real; + + // compute potential gradient on my FFT grid and + // portion of group-group energy/force on this proc's FFT grid + + poisson_groups(AA_flag); + + const double qscale = qqrd2e * scale; + + // total group A <--> group B energy + // self and boundary correction terms are in compute_group_group.cpp + + double e2group_all; + MPI_Allreduce(&e2group,&e2group_all,1,MPI_DOUBLE,MPI_SUM,world); + e2group = e2group_all; + + e2group *= qscale*0.5*volume; + + // total group A <--> group B force + + double f2group_all[3]; + MPI_Allreduce(f2group,f2group_all,3,MPI_DOUBLE,MPI_SUM,world); + + f2group[0] = qscale*volume*f2group_all[0]; + f2group[1] = qscale*volume*f2group_all[1]; + if (slabflag != 2) f2group[2] = qscale*volume*f2group_all[2]; + + // convert atoms back from lamda to box coords + + if (triclinic) 
domain->lamda2x(atom->nlocal); + + if (slabflag == 1) + slabcorr_groups(groupbit_A, groupbit_B, AA_flag); +} + +/* ---------------------------------------------------------------------- + allocate group-group memory that depends on # of K-vectors and order + ------------------------------------------------------------------------- */ + +void PPPM2::allocate_groups() +{ + group_allocate_flag = 1; + + memory->create3d_offset(density_A_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_A_brick"); + memory->create3d_offset(density_B_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out, + nxlo_out,nxhi_out,"pppm:density_B_brick"); + memory->create(density_A_fft,nfft_both,"pppm:density_A_fft"); + memory->create(density_B_fft,nfft_both,"pppm:density_B_fft"); +} + +/* ---------------------------------------------------------------------- + deallocate group-group memory that depends on # of K-vectors and order + ------------------------------------------------------------------------- */ + +void PPPM2::deallocate_groups() +{ + group_allocate_flag = 0; + + memory->destroy3d_offset(density_A_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy3d_offset(density_B_brick,nzlo_out,nylo_out,nxlo_out); + memory->destroy(density_A_fft); + memory->destroy(density_B_fft); +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid for group-group interactions + ------------------------------------------------------------------------- */ + +void PPPM2::make_rho_groups(int groupbit_A, int groupbit_B, int AA_flag) +{ + int l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + + // clear 3d density arrays + + memset(&(density_A_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + memset(&(density_B_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + double *q = atom->q; + double **x = atom->x; + int nlocal = atom->nlocal; + int *mask = atom->mask; + + for (int i = 0; i < nlocal; i++) { + + if (!((mask[i] & groupbit_A) && (mask[i] & groupbit_B))) + if (AA_flag) continue; + + if ((mask[i] & groupbit_A) || (mask[i] & groupbit_B)) { + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + z0 = delvolinv * q[i]; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + y0 = z0*rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + x0 = y0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + + // group A + + if (mask[i] & groupbit_A) + density_A_brick[mz][my][mx] += x0*rho1d[0][l]; + + // group B + + if (mask[i] & groupbit_B) + density_B_brick[mz][my][mx] += x0*rho1d[0][l]; + } + } + } + } + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for group-group interactions + ------------------------------------------------------------------------- */ + +void PPPM2::poisson_groups(int AA_flag) +{ + 
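  // reciprocal-space sums evaluated below, once work_A/work_B hold the
  // forward FFTs of the group A and group B charge densities:
  //   energy:  e2group += s2 * G(k) * Re[ rho_A(k) * conj(rho_B(k)) ]
  //   force:   f2group += (kx,ky,kz) * Im[ rho_A(k) * conj(rho_B(k)) ],
  //            accumulated after work_A has been scaled by s2 * G(k)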
int i,j,k,n; + + // reuse memory (already declared) + + FFT_SCALAR *work_A = work1; + FFT_SCALAR *work_B = work2; + + // transform charge density (r -> k) + + // group A + + n = 0; + for (i = 0; i < nfft; i++) { + work_A[n++] = density_A_fft[i]; + work_A[n++] = ZEROF; + } + + fft1->compute(work_A,work_A,1); + + // group B + + n = 0; + for (i = 0; i < nfft; i++) { + work_B[n++] = density_B_fft[i]; + work_B[n++] = ZEROF; + } + + fft1->compute(work_B,work_B,1); + + // group-group energy and force contribution, + // keep everything in reciprocal space so + // no inverse FFTs needed + + double scaleinv = 1.0/(nx_pppm*ny_pppm*nz_pppm); + double s2 = scaleinv*scaleinv; + + // energy + + n = 0; + for (i = 0; i < nfft; i++) { + e2group += s2 * greensfn[i] * + (work_A[n]*work_B[n] + work_A[n+1]*work_B[n+1]); + n += 2; + } + + if (AA_flag) return; + + + // multiply by Green's function and s2 + // (only for work_A so it is not squared below) + + n = 0; + for (i = 0; i < nfft; i++) { + work_A[n++] *= s2 * greensfn[i]; + work_A[n++] *= s2 * greensfn[i]; + } + + // triclinic system + + if (triclinic) { + poisson_groups_triclinic(); + return; + } + + double partial_group; + + // force, x direction + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[0] += fkx[i] * partial_group; + n += 2; + } + + // force, y direction + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[1] += fky[j] * partial_group; + n += 2; + } + + // force, z direction + + n = 0; + for (k = nzlo_fft; k <= nzhi_fft; k++) + for (j = nylo_fft; j <= nyhi_fft; j++) + for (i = nxlo_fft; i <= nxhi_fft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[2] += fkz[k] * partial_group; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + FFT-based Poisson solver for group-group interactions + for a triclinic system + ------------------------------------------------------------------------- */ + +void PPPM2::poisson_groups_triclinic() +{ + int i,n; + + // reuse memory (already declared) + + FFT_SCALAR *work_A = work1; + FFT_SCALAR *work_B = work2; + + double partial_group; + + // force, x direction + + n = 0; + for (i = 0; i < nfft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[0] += fkx[i] * partial_group; + n += 2; + } + + // force, y direction + + n = 0; + for (i = 0; i < nfft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[1] += fky[i] * partial_group; + n += 2; + } + + // force, z direction + + n = 0; + for (i = 0; i < nfft; i++) { + partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1]; + f2group[2] += fkz[i] * partial_group; + n += 2; + } +} + +/* ---------------------------------------------------------------------- + Slab-geometry correction term to dampen inter-slab interactions between + periodically repeating slabs. Yields good approximation to 2D Ewald if + adequate empty space is left between repeating slabs (J. Chem. Phys. + 111, 3155). Slabs defined here to be parallel to the xy plane. Also + extended to non-neutral systems (J. Chem. Phys. 131, 094107). 
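   The A <-> B correction accumulated below is
     E_corr = 2*pi*qscale/V * [ M_A*M_B - (q_A*S_B + q_B*S_A)/2
                                - q_A*q_B*Lz^2/12 ]
   with M = sum q_i*z_i, S = sum q_i*z_i^2, q = net charge of the group;
   the matching z-force term is -4*pi*qscale/V * (q_A*M_B - q_B*M_A).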
+------------------------------------------------------------------------- */ + +void PPPM2::slabcorr_groups(int groupbit_A, int groupbit_B, int AA_flag) +{ + // compute local contribution to global dipole moment + + double *q = atom->q; + double **x = atom->x; + double zprd = domain->zprd; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + double qsum_A = 0.0; + double qsum_B = 0.0; + double dipole_A = 0.0; + double dipole_B = 0.0; + double dipole_r2_A = 0.0; + double dipole_r2_B = 0.0; + + for (int i = 0; i < nlocal; i++) { + if (!((mask[i] & groupbit_A) && (mask[i] & groupbit_B))) + if (AA_flag) continue; + + if (mask[i] & groupbit_A) { + qsum_A += q[i]; + dipole_A += q[i]*x[i][2]; + dipole_r2_A += q[i]*x[i][2]*x[i][2]; + } + + if (mask[i] & groupbit_B) { + qsum_B += q[i]; + dipole_B += q[i]*x[i][2]; + dipole_r2_B += q[i]*x[i][2]*x[i][2]; + } + } + + // sum local contributions to get total charge and global dipole moment + // for each group + + double tmp; + MPI_Allreduce(&qsum_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + qsum_A = tmp; + + MPI_Allreduce(&qsum_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + qsum_B = tmp; + + MPI_Allreduce(&dipole_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_A = tmp; + + MPI_Allreduce(&dipole_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_B = tmp; + + MPI_Allreduce(&dipole_r2_A,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_r2_A = tmp; + + MPI_Allreduce(&dipole_r2_B,&tmp,1,MPI_DOUBLE,MPI_SUM,world); + dipole_r2_B = tmp; + + // compute corrections + + const double qscale = qqrd2e * scale; + const double efact = qscale * MY_2PI/volume; + + e2group += efact * (dipole_A*dipole_B - 0.5*(qsum_A*dipole_r2_B + + qsum_B*dipole_r2_A) - qsum_A*qsum_B*zprd*zprd/12.0); + + // add on force corrections + + const double ffact = qscale * (-4.0*MY_PI/volume); + f2group[2] += ffact * (qsum_A*dipole_B - qsum_B*dipole_A); +} diff --git a/src/KSPACE/pppm2.h b/src/KSPACE/pppm2.h new file mode 100644 index 0000000000..31b8534735 --- /dev/null +++ b/src/KSPACE/pppm2.h @@ -0,0 +1,360 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(pppm2,PPPM2) + +#else + +#ifndef LMP_PPPM2_H +#define LMP_PPPM2_H + +#include "kspace.h" + +#if defined(FFT_FFTW3) +#define LMP_FFT_LIB "FFTW3" +#elif defined(FFT_MKL) +#define LMP_FFT_LIB "MKL FFT" +#elif defined(FFT_CUFFT) +#define LMP_FFT_LIB "cuFFT" +#else +#define LMP_FFT_LIB "KISS FFT" +#endif + +#ifdef FFT_SINGLE +typedef float FFT_SCALAR; +#define LMP_FFT_PREC "single" +#define MPI_FFT_SCALAR MPI_FLOAT +#else + +typedef double FFT_SCALAR; +#define LMP_FFT_PREC "double" +#define MPI_FFT_SCALAR MPI_DOUBLE +#endif + +namespace LAMMPS_NS { + +class PPPM2 : public KSpace { + public: + PPPM2(class LAMMPS *); + virtual ~PPPM2(); + virtual void settings(int, char **); + virtual void init(); + virtual void setup(); + virtual void setup_grid(); + virtual void compute(int, int); + virtual int timing_1d(int, double &); + virtual int timing_3d(int, double &); + virtual double memory_usage(); + + virtual void compute_group_group(int, int, int); + + protected: + int me,nprocs; + int nfactors; + int *factors; + double cutoff; + double volume; + double delxinv,delyinv,delzinv,delvolinv; + double h_x,h_y,h_z; + double shift,shiftone; + int peratom_allocate_flag; + + int nxlo_in,nylo_in,nzlo_in,nxhi_in,nyhi_in,nzhi_in; + int nxlo_out,nylo_out,nzlo_out,nxhi_out,nyhi_out,nzhi_out; + int nxlo_ghost,nxhi_ghost,nylo_ghost,nyhi_ghost,nzlo_ghost,nzhi_ghost; + int nxlo_fft,nylo_fft,nzlo_fft,nxhi_fft,nyhi_fft,nzhi_fft; + int nlower,nupper; + int ngrid,nfft,nfft_both; + + FFT_SCALAR ***density_brick; + FFT_SCALAR ***vdx_brick,***vdy_brick,***vdz_brick; + FFT_SCALAR ***u_brick; + FFT_SCALAR ***v0_brick,***v1_brick,***v2_brick; + FFT_SCALAR ***v3_brick,***v4_brick,***v5_brick; + double *greensfn; + double **vg; + double *fkx,*fky,*fkz; + FFT_SCALAR *density_fft; + FFT_SCALAR *work1,*work2; + + double *gf_b; + FFT_SCALAR **rho1d,**rho_coeff,**drho1d,**drho_coeff; + double *sf_precoeff1, *sf_precoeff2, *sf_precoeff3; + double *sf_precoeff4, *sf_precoeff5, *sf_precoeff6; + double sf_coeff[6]; // coefficients for calculating ad self-forces + double **acons; + + // FFTs and grid communication + + class FFT3d *fft1,*fft2; + class Remap *remap; + class GridComm2 *gc; + FFT_SCALAR *gc_buf1,*gc_buf2; + int ngc_buf1,ngc_buf2,npergrid; + + // group-group interactions + + int group_allocate_flag; + FFT_SCALAR ***density_A_brick,***density_B_brick; + FFT_SCALAR *density_A_fft,*density_B_fft; + + int **part2grid; // storage for particle -> grid mapping + int nmax; + + double *boxlo; + // TIP4P settings + int typeH,typeO; // atom types of TIP4P water H and O atoms + double qdist; // distance from O site to negative charge + double alpha; // geometric factor + + virtual void set_grid_global(); + void set_grid_local(); + void adjust_gewald(); + virtual double newton_raphson_f(); + double derivf(); + double final_accuracy(); + + virtual void allocate(); + virtual void allocate_peratom(); + virtual void deallocate(); + virtual void deallocate_peratom(); + int factorable(int); + double compute_df_kspace(); + double estimate_ik_error(double, double, bigint); + virtual double compute_qopt(); + virtual void compute_gf_denom(); + virtual void compute_gf_ik(); + virtual void compute_gf_ad(); + void compute_sf_precoeff(); + + virtual void particle_map(); + virtual void make_rho(); + virtual void brick2fft(); + + virtual void poisson(); + virtual void poisson_ik(); + virtual void poisson_ad(); + + virtual void fieldforce(); 
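  // _ik variant: E-field from the vdx/vdy/vdz bricks
  // _ad variant: differentiates the u_brick potential per atom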
+ virtual void fieldforce_ik(); + virtual void fieldforce_ad(); + + virtual void poisson_peratom(); + virtual void fieldforce_peratom(); + void procs2grid2d(int,int,int,int *, int*); + void compute_rho1d(const FFT_SCALAR &, const FFT_SCALAR &, + const FFT_SCALAR &); + void compute_drho1d(const FFT_SCALAR &, const FFT_SCALAR &, + const FFT_SCALAR &); + void compute_rho_coeff(); + virtual void slabcorr(); + + // grid communication + + virtual void pack_forward2(int, void *, int, int *); + virtual void unpack_forward2(int, void *, int, int *); + virtual void pack_reverse2(int, void *, int, int *); + virtual void unpack_reverse2(int, void *, int, int *); + + // triclinic + + int triclinic; // domain settings, orthog or triclinic + void setup_triclinic(); + void compute_gf_ik_triclinic(); + void poisson_ik_triclinic(); + void poisson_groups_triclinic(); + + // group-group interactions + + virtual void allocate_groups(); + virtual void deallocate_groups(); + virtual void make_rho_groups(int, int, int); + virtual void poisson_groups(int); + virtual void slabcorr_groups(int,int,int); + +/* ---------------------------------------------------------------------- + denominator for Hockney-Eastwood Green's function + of x,y,z = sin(kx*deltax/2), etc + + inf n-1 + S(n,k) = Sum W(k+pi*j)**2 = Sum b(l)*(z*z)**l + j=-inf l=0 + + = -(z*z)**n /(2n-1)! * (d/dx)**(2n-1) cot(x) at z = sin(x) + gf_b = denominator expansion coeffs +------------------------------------------------------------------------- */ + + inline double gf_denom(const double &x, const double &y, + const double &z) const { + double sx,sy,sz; + sz = sy = sx = 0.0; + for (int l = order-1; l >= 0; l--) { + sx = gf_b[l] + sx*x; + sy = gf_b[l] + sy*y; + sz = gf_b[l] + sz*z; + } + double s = sx*sy*sz; + return s*s; + }; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Must redefine kspace_style after changing to triclinic box + +UNDOCUMENTED + +E: Cannot (yet) use PPPM with triclinic box and kspace_modify diff ad + +This feature is not yet supported. + +E: Cannot (yet) use PPPM with triclinic box and slab correction + +This feature is not yet supported. + +E: Cannot use PPPM with 2d simulation + +The kspace style pppm cannot be used in 2d simulations. You can use +2d PPPM in a 3d simulation; see the kspace_modify command. + +E: PPPM can only currently be used with comm_style brick + +This is a current restriction in LAMMPS. + +E: Kspace style requires atom attribute q + +The atom style defined does not have these attributes. + +E: Cannot use non-periodic boundaries with PPPM + +For kspace style pppm, all 3 dimensions must have periodic boundaries +unless you use the kspace_modify command to define a 2d slab with a +non-periodic z dimension. + +E: Incorrect boundaries with slab PPPM + +Must have periodic x,y dimensions and non-periodic z dimension to use +2d slab option with PPPM. + +E: PPPM order cannot be < 2 or > than %d + +This is a limitation of the PPPM implementation in LAMMPS. + +E: KSpace style is incompatible with Pair style + +Setting a kspace style requires that a pair style with matching +long-range Coulombic or dispersion components be used. + +E: Pair style is incompatible with TIP4P KSpace style + +The pair style does not have the requires TIP4P settings. 
+ +E: Bond and angle potentials must be defined for TIP4P + +Cannot use TIP4P pair potential unless bond and angle potentials +are defined. + +E: Bad TIP4P angle type for PPPM/TIP4P + +Specified angle type is not valid. + +E: Bad TIP4P bond type for PPPM/TIP4P + +Specified bond type is not valid. + +W: Reducing PPPM order b/c stencil extends beyond nearest neighbor processor + +This may lead to a larger grid than desired. See the kspace_modify overlap +command to prevent changing of the PPPM order. + +E: PPPM order < minimum allowed order + +The default minimum order is 2. This can be reset by the +kspace_modify minorder command. + +E: PPPM grid stencil extends beyond nearest neighbor processor + +This is not allowed if the kspace_modify overlap setting is no. + +E: KSpace accuracy must be > 0 + +The kspace accuracy designated in the input must be greater than zero. + +E: Must use kspace_modify gewald for uncharged system + +UNDOCUMENTED + +E: Could not compute grid size + +The code is unable to compute a grid size consistent with the desired +accuracy. This error should not occur for typical problems. Please +send an email to the developers. + +E: PPPM grid is too large + +The global PPPM grid is larger than OFFSET in one or more dimensions. +OFFSET is currently set to 4096. You likely need to decrease the +requested accuracy. + +E: Could not compute g_ewald + +The Newton-Raphson solver failed to converge to a good value for +g_ewald. This error should not occur for typical problems. Please +send an email to the developers. + +E: Non-numeric box dimensions - simulation unstable + +The box size has apparently blown up. + +E: Out of range atoms - cannot compute PPPM + +One or more atoms are attempting to map their charge to a PPPM grid +point that is not owned by a processor. This is likely for one of two +reasons, both of them bad. First, it may mean that an atom near the +boundary of a processor's sub-domain has moved more than 1/2 the +"neighbor skin distance"_neighbor.html without neighbor lists being +rebuilt and atoms being migrated to new processors. This also means +you may be missing pairwise interactions that need to be computed. +The solution is to change the re-neighboring criteria via the +"neigh_modify"_neigh_modify command. The safest settings are "delay 0 +every 1 check yes". Second, it may mean that an atom has moved far +outside a processor's sub-domain or even the entire simulation box. +This indicates bad physics, e.g. due to highly overlapping atoms, too +large a timestep, etc. + +E: Cannot (yet) use K-space slab correction with compute group/group for triclinic systems + +This option is not yet supported. + +E: Cannot (yet) use kspace_modify diff ad with compute group/group + +This option is not yet supported. + +U: Cannot (yet) use PPPM with triclinic box and TIP4P + +This feature is not yet supported. 
+ +*/ diff --git a/src/force.cpp b/src/force.cpp index d2eb137d06..595ffd3140 100644 --- a/src/force.cpp +++ b/src/force.cpp @@ -665,9 +665,9 @@ void Force::create_kspace(const std::string &style, int trysuffix) kspace = new_kspace(style,trysuffix,sflag); store_style(kspace_style,style,sflag); - if (comm->style == 1 && !kspace_match("ewald",0)) - error->all(FLERR, - "Cannot yet use KSpace solver with grid with comm style tiled"); + //if (comm->style == 1 && !kspace_match("ewald",0)) + // error->all(FLERR, + // "Cannot yet use KSpace solver with grid with comm style tiled"); } /* ---------------------------------------------------------------------- diff --git a/src/kspace.h b/src/kspace.h index e26b9490d6..4d686d59f0 100644 --- a/src/kspace.h +++ b/src/kspace.h @@ -126,6 +126,11 @@ class KSpace : protected Pointers { virtual void pack_reverse(int, FFT_SCALAR *, int, int *) {}; virtual void unpack_reverse(int, FFT_SCALAR *, int, int *) {}; + virtual void pack_forward2(int, void *, int, int *) {}; + virtual void unpack_forward2(int, void *, int, int *) {}; + virtual void pack_reverse2(int, void *, int, int *) {}; + virtual void unpack_reverse2(int, void *, int, int *) {}; + virtual int timing(int, double &, double &) {return 0;} virtual int timing_1d(int, double &) {return 0;} virtual int timing_3d(int, double &) {return 0;}
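For readers who want to exercise the 2d processor-factorization heuristic from
PPPM2::procs2grid2d() outside of LAMMPS, a minimal standalone sketch follows. It
is illustrative only: the function name pick_grid2d, the main() driver, and the
12-proc / 64x32 sample grid are invented for the example, but the selection rule
mirrors the loop shown earlier (enumerate all factorizations px*py = nprocs,
take the ceiling of the per-proc sub-domain size in each direction, minimize
boxx+boxy, and break ties toward the larger sub-domain area).

// standalone sketch of the procs2grid2d() factorization heuristic
// (illustrative only; names and sample sizes are not part of the patch)

#include <cstdio>

static void pick_grid2d(int nprocs, int nx, int ny, int *px, int *py)
{
  int bestsurf = 2 * (nx + ny);        // worse than any real factorization
  int bestboxx = 0, bestboxy = 0;

  for (int ipx = 1; ipx <= nprocs; ipx++) {
    if (nprocs % ipx) continue;
    int ipy = nprocs / ipx;
    int boxx = (nx + ipx - 1) / ipx;   // ceil(nx/ipx) = largest sub-domain in x
    int boxy = (ny + ipy - 1) / ipy;   // ceil(ny/ipy) = largest sub-domain in y
    int surf = boxx + boxy;            // proxy for ghost-communication surface
    if (surf < bestsurf ||
        (surf == bestsurf && boxx*boxy > bestboxx*bestboxy)) {
      bestsurf = surf;
      bestboxx = boxx;
      bestboxy = boxy;
      *px = ipx;
      *py = ipy;
    }
  }
}

int main()
{
  int px, py;
  pick_grid2d(12, 64, 32, &px, &py);   // e.g. 12 procs on a 64x32 FFT plane
  printf("12 procs on 64x32 -> %d x %d\n", px, py);
  return 0;
}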