// clang-format off /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories LAMMPS development team: developers@lammps.org Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing author: Trung Nguyen (Northwestern/UChicago) ------------------------------------------------------------------------- */ #include "pair_amoeba_gpu.h" #include "amoeba_convolution_gpu.h" #include "atom.h" #include "comm.h" #include "domain.h" #include "error.h" #include "fix_store_atom.h" #include "force.h" #include "gpu_extra.h" #include "info.h" #include "math_const.h" #include "memory.h" #include "my_page.h" #include "neigh_list.h" #include "neigh_request.h" #include "neighbor.h" #include "suffix.h" #include using namespace LAMMPS_NS; using namespace MathConst; // same as in amoeba_induce.cpp enum{INDUCE,RSD,SETUP_AMOEBA,SETUP_HIPPO,KMPOLE,AMGROUP}; // forward comm enum{FIELD,ZRSD,TORQUE,UFLD}; // reverse comm enum{VDWL,REPULSE,QFER,DISP,MPOLE,POLAR,USOLV,DISP_LONG,MPOLE_LONG,POLAR_LONG}; enum{MUTUAL,OPT,TCG,DIRECT}; enum{GEAR,ASPC,LSQR}; enum{BUILD,APPLY}; enum{GORDON1,GORDON2}; // same as in pair_amoeba.cpp enum{MPOLE_GRID,POLAR_GRID,POLAR_GRIDC,DISP_GRID,INDUCE_GRID,INDUCE_GRIDC}; #define DEBYE 4.80321 // conversion factor from q-Angs (real units) to Debye // External functions from cuda library for atom decomposition int amoeba_gpu_init(const int ntypes, const int max_amtype, const int max_amclass, const double *host_pdamp, const double *host_thole, const double *host_dirdamp, const int* host_amtype2class, const double *host_special_hal, const double *host_special_repel, const double *host_special_disp, const double *host_special_mpole, const double *host_special_polar_wscale, const double *host_special_polar_piscale, const double *host_special_polar_pscale, const double *host_csix, const double *host_adisp, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const int maxspecial15, const double cell_size, int &gpu_mode, FILE *screen, const double polar_dscale, const double polar_uscale); void amoeba_gpu_clear(); int** amoeba_gpu_precompute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, double *host_pval, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int *nspecial15, tagint **special15, const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd); void amoeba_gpu_compute_multipole_real(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *host_amtype, int *host_amgroup, double **host_rpole, double *sublo, double *subhi, tagint *tag, int **nspecial, tagint **special, int* nspecial15, tagint** special15, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, const double aewald, const double felec, const double off2, double *host_q, double *boxlo, double *prd, void **tq_ptr); void amoeba_gpu_compute_udirect2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const double aewald, const double off2, void **fieldp_ptr); void amoeba_gpu_compute_umutual2b(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const double aewald, const double off2, void **fieldp_ptr); void amoeba_gpu_update_fieldp(void **fieldp_ptr); void amoeba_gpu_precompute_kspace(const int inum_full, const int bsorder, double ***host_thetai1, double ***host_thetai2, double ***host_thetai3, int** igrid, const int nzlo_out, const int nzhi_out, const int nylo_out, const int nyhi_out, const int nxlo_out, const int nxhi_out); void amoeba_gpu_fphi_uind(double ****host_grid_brick, void **host_fdip_phi1, void **host_fdip_phi2, void **host_fdip_sum_phi); void amoeba_gpu_fphi_mpole(double ***host_grid_brick, void **host_fdip_sum_phi, const double felec); void amoeba_gpu_compute_polar_real(int *host_amtype, int *host_amgroup, double **host_rpole, double **host_uind, double **host_uinp, const bool eflag, const bool vflag, const bool eatom, const bool vatom, const double aewald, const double felec, const double off2, void **tq_ptr); double amoeba_gpu_bytes(); /* ---------------------------------------------------------------------- */ PairAmoebaGPU::PairAmoebaGPU(LAMMPS *lmp) : PairAmoeba(lmp), gpu_mode(GPU_FORCE) { respa_enable = 0; reinitflag = 0; cpu_time = 0.0; suffix_flag |= Suffix::GPU; fieldp_pinned = nullptr; tq_pinned = nullptr; gpu_hal_ready = false; // true for AMOEBA when ready gpu_repulsion_ready = false; // always false for AMOEBA gpu_dispersion_real_ready = false; // always false for AMOEBA gpu_multipole_real_ready = true; // need to be true for precompute() gpu_udirect2b_ready = true; gpu_umutual1_ready = true; gpu_fphi_uind_ready = true; gpu_umutual2b_ready = true; gpu_polar_real_ready = true; // need to be true for copying data from device back to host GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); } /* ---------------------------------------------------------------------- free all arrays ------------------------------------------------------------------------- */ PairAmoebaGPU::~PairAmoebaGPU() { amoeba_gpu_clear(); } /* ---------------------------------------------------------------------- */ void PairAmoebaGPU::compute(int eflag, int vflag) { if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0) neighbor->build_topology(); PairAmoeba::compute(eflag, vflag); } /* ---------------------------------------------------------------------- init specific to this pair style ------------------------------------------------------------------------- */ void PairAmoebaGPU::init_style() { PairAmoeba::init_style(); // Repeat cutsq calculation because done after call to init_style double maxcut = -1.0; double cut; for (int i = 1; i <= atom->ntypes; i++) { for (int j = i; j <= atom->ntypes; j++) { if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) { cut = init_one(i,j); cut *= cut; if (cut > maxcut) maxcut = cut; cutsq[i][j] = cutsq[j][i] = cut; } else cutsq[i][j] = cutsq[j][i] = 0.0; } } double cell_size = sqrt(maxcut) + neighbor->skin; int maxspecial=0; int maxspecial15=0; if (atom->molecular != Atom::ATOMIC) { maxspecial=atom->maxspecial; maxspecial15=atom->maxspecial15; } int mnf = 5e-2 * neighbor->oneatom; int success = amoeba_gpu_init(atom->ntypes+1, max_amtype, max_amclass, pdamp, thole, dirdamp, amtype2class, special_hal, special_repel, special_disp, special_mpole, special_polar_wscale, special_polar_piscale, special_polar_pscale, csix, adisp, atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, maxspecial15, cell_size, gpu_mode, screen, polar_dscale, polar_uscale); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) error->all(FLERR,"Pair style amoeba/gpu does not support neigh no for now"); acc_float = Info::has_accelerator_feature("GPU", "precision", "single"); // replace with the gpu counterpart if (gpu_umutual1_ready) { if (use_ewald && ic_kspace) { delete ic_kspace; ic_kspace = new AmoebaConvolutionGPU(lmp,this,nefft1,nefft2,nefft3,bsporder,INDUCE_GRIDC); } } } /* ---------------------------------------------------------------------- multipole_real = real-space portion of mulipole interactions adapted from Tinker emreal1d() routine ------------------------------------------------------------------------- */ void PairAmoebaGPU::multipole_real() { if (!gpu_multipole_real_ready) { PairAmoeba::multipole_real(); return; } int eflag=1, vflag=1; double **f = atom->f; int nall = atom->nlocal + atom->nghost; int inum, host_start; bool success = true; int *ilist, *numneigh; double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; sublo[2] = domain->sublo[2]; subhi[0] = domain->subhi[0]; subhi[1] = domain->subhi[1]; subhi[2] = domain->subhi[2]; } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } inum = atom->nlocal; amoeba_gpu_precompute(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, nullptr, nullptr, nullptr, sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success, atom->q, domain->boxlo, domain->prd); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); // select the correct cutoff for the term if (use_ewald) choose(MPOLE_LONG); else choose(MPOLE); // set the energy unit conversion factor for multipolar real-space calculation double felec = electric / am_dielectric; amoeba_gpu_compute_multipole_real(neighbor->ago, inum, nall, atom->x, atom->type, amtype, amgroup, rpole, sublo, subhi, atom->tag, atom->nspecial, atom->special, atom->nspecial15, atom->special15, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh, cpu_time, success, aewald, felec, off2, atom->q, domain->boxlo, domain->prd, &tq_pinned); // reference to the tep array from GPU lib if (acc_float) { auto *tq_ptr = (float *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } else { auto *tq_ptr = (double *)tq_pinned; compute_force_from_torque(tq_ptr, f, virmpole); // fmpole } } /* ---------------------------------------------------------------------- induce = induced dipole moments via pre-conditioned CG solver adapted from Tinker induce0a() routine NOTE: Almost the same in the CPU version, except that there is no need to call reverse_comm() for crstyle = FIELD; ------------------------------------------------------------------------- */ void PairAmoebaGPU::induce() { bool done; int i,j,m,itype; int iter,maxiter; double polmin; double eps,epsold; double epsd,epsp; double udsum,upsum; double a,ap,b,bp; double sum,sump,term; double reduce[4],allreduce[4]; // set cutoffs, taper coeffs, and PME params // create qfac here, free at end of polar() if (use_ewald) choose(POLAR_LONG); else choose(POLAR); // owned atoms int nlocal = atom->nlocal; // zero out the induced dipoles at each site for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uind[i][j] = 0.0; uinp[i][j] = 0.0; } } // get the electrostatic field due to permanent multipoles dfield0c(field,fieldp); // need reverse_comm if dfield0c (i.e. udirect2b) is CPU-only if (!gpu_udirect2b_ready) { crstyle = FIELD; comm->reverse_comm(this); } // set induced dipoles to polarizability times direct field for (i = 0; i < nlocal; i++) { itype = amtype[i]; for (j = 0; j < 3; j++) { udir[i][j] = polarity[itype] * field[i][j]; udirp[i][j] = polarity[itype] * fieldp[i][j]; if (pcgguess) { uind[i][j] = udir[i][j]; uinp[i][j] = udirp[i][j]; } } } // allocate memory and make early host-device transfers // must be done before the first ufield0c // NOTE: this is for ic_kspace, and thetai[1-3] if (ic_kspace) amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, thetai3, igrid, ic_kspace->nzlo_out, ic_kspace->nzhi_out, ic_kspace->nylo_out, ic_kspace->nyhi_out, ic_kspace->nxlo_out, ic_kspace->nxhi_out); // get induced dipoles via the OPT extrapolation method // NOTE: any way to rewrite these loops to avoid allocating // uopt,uoptp with a optorder+1 dimension, just optorder ?? // since no need to store optorder+1 values after these loops if (poltyp == OPT) { for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uopt[i][0][j] = udir[i][j]; uoptp[i][0][j] = udirp[i][j]; } } for (m = 1; m <= optorder; m++) { optlevel = m - 1; // used in umutual1() for fopt,foptp cfstyle = INDUCE; comm->forward_comm(this); ufield0c(field,fieldp); if (!gpu_umutual2b_ready) { crstyle = FIELD; comm->reverse_comm(this); } for (i = 0; i < nlocal; i++) { itype = amtype[i]; for (j = 0; j < 3; j++) { uopt[i][m][j] = polarity[itype] * field[i][j]; uoptp[i][m][j] = polarity[itype] * fieldp[i][j]; uind[i][j] = uopt[i][m][j]; uinp[i][j] = uoptp[i][m][j]; } } } for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uind[i][j] = 0.0; uinp[i][j] = 0.0; usum[i][j] = 0.0; usump[i][j] = 0.0; for (m = 0; m <= optorder; m++) { usum[i][j] += uopt[i][m][j]; usump[i][j] += uoptp[i][m][j]; uind[i][j] += copt[m]*usum[i][j]; uinp[i][j] += copt[m]*usump[i][j]; } } } } // set tolerances for computation of mutual induced dipoles if (poltyp == MUTUAL) { done = false; maxiter = 100; iter = 0; polmin = 0.00000001; eps = 100.0; // estimate induced dipoles using a polynomial predictor if (use_pred && nualt == maxualt) { ulspred(); double ***udalt = fixudalt->tstore; double ***upalt = fixupalt->tstore; for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { udsum = 0.0; upsum = 0.0; for (m = 0; m < nualt; m++) { udsum += bpred[m]*udalt[i][m][j]; upsum += bpredp[m]*upalt[i][m][j]; } uind[i][j] = udsum; uinp[i][j] = upsum; } } } // estimate induced dipoles via inertial extended Lagrangian // not supported for now // requires uaux,upaux to persist with each atom // also requires a velocity vector(s) to persist // also requires updating uaux,upaux in the Verlet integration /* if (use_ielscf) { for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uind[i][j] = uaux[i][j]; uinp[i][j] = upaux[i][j]; } } } */ // get the electrostatic field due to induced dipoles cfstyle = INDUCE; comm->forward_comm(this); ufield0c(field,fieldp); if (!gpu_umutual2b_ready) { crstyle = FIELD; comm->reverse_comm(this); } // set initial conjugate gradient residual and conjugate vector for (i = 0; i < nlocal; i++) { itype = amtype[i]; poli[i] = MAX(polmin,polarity[itype]); for (j = 0; j < 3; j++) { if (pcgguess) { rsd[i][j] = (udir[i][j]-uind[i][j])/poli[i] + field[i][j]; rsdp[i][j] = (udirp[i][j]-uinp[i][j])/poli[i] + fieldp[i][j]; } else { rsd[i][j] = udir[i][j] / poli[i]; rsdp[i][j] = udirp[i][j] / poli[i]; } zrsd[i][j] = rsd[i][j]; zrsdp[i][j] = rsdp[i][j]; } } if (pcgprec) { cfstyle = RSD; comm->forward_comm(this); uscale0b(BUILD,rsd,rsdp,zrsd,zrsdp); uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm(this); } for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { conj[i][j] = zrsd[i][j]; conjp[i][j] = zrsdp[i][j]; } } // conjugate gradient iteration of the mutual induced dipoles while (!done) { iter++; for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { vec[i][j] = uind[i][j]; vecp[i][j] = uinp[i][j]; uind[i][j] = conj[i][j]; uinp[i][j] = conjp[i][j]; } } cfstyle = INDUCE; comm->forward_comm(this); ufield0c(field,fieldp); if (!gpu_umutual2b_ready) { crstyle = FIELD; comm->reverse_comm(this); } for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uind[i][j] = vec[i][j]; uinp[i][j] = vecp[i][j]; vec[i][j] = conj[i][j]/poli[i] - field[i][j]; vecp[i][j] = conjp[i][j]/poli[i] - fieldp[i][j]; } } a = 0.0; ap = 0.0; sum = 0.0; sump = 0.0; for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { a += conj[i][j]*vec[i][j]; ap += conjp[i][j]*vecp[i][j]; sum += rsd[i][j]*zrsd[i][j]; sump += rsdp[i][j]*zrsdp[i][j]; } } reduce[0] = a; reduce[1] = ap; reduce[2] = sum; reduce[3] = sump; MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); a = allreduce[0]; ap = allreduce[1]; sum = allreduce[2]; sump = allreduce[3]; if (a != 0.0) a = sum / a; if (ap != 0.0) ap = sump / ap; for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { uind[i][j] = uind[i][j] + a*conj[i][j]; uinp[i][j] = uinp[i][j] + ap*conjp[i][j]; rsd[i][j] = rsd[i][j] - a*vec[i][j]; rsdp[i][j] = rsdp[i][j] - ap*vecp[i][j]; zrsd[i][j] = rsd[i][j]; zrsdp[i][j] = rsdp[i][j]; } } if (pcgprec) { cfstyle = RSD; comm->forward_comm(this); uscale0b(APPLY,rsd,rsdp,zrsd,zrsdp); crstyle = ZRSD; comm->reverse_comm(this); } b = 0.0; bp = 0.0; for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { b += rsd[i][j]*zrsd[i][j]; bp += rsdp[i][j]*zrsdp[i][j]; } } reduce[0] = b; reduce[1] = bp; MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); b = allreduce[0]; bp = allreduce[1]; if (sum != 0.0) b /= sum; if (sump != 0.0) bp /= sump; epsd = 0.0; epsp = 0.0; for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { conj[i][j] = zrsd[i][j] + b*conj[i][j]; conjp[i][j] = zrsdp[i][j] + bp*conjp[i][j]; epsd += rsd[i][j]*rsd[i][j]; epsp += rsdp[i][j]*rsdp[i][j]; } } reduce[0] = epsd; reduce[1] = epsp; MPI_Allreduce(reduce,allreduce,4,MPI_DOUBLE,MPI_SUM,world); epsd = allreduce[0]; epsp = allreduce[1]; // check the convergence of the mutual induced dipoles epsold = eps; eps = MAX(epsd,epsp); eps = DEBYE * sqrt(eps/atom->natoms); if (eps < poleps) done = true; if (eps > epsold) done = true; if (iter >= politer) done = true; // apply a "peek" iteration to the mutual induced dipoles if (done) { for (i = 0; i < nlocal; i++) { term = pcgpeek * poli[i]; for (j = 0; j < 3; j++) { uind[i][j] += term*rsd[i][j]; uinp[i][j] += term*rsdp[i][j]; } } } } // terminate the calculation if dipoles failed to converge // NOTE: could make this an error if (iter >= maxiter || eps > epsold) if (comm->me == 0) error->warning(FLERR,"AMOEBA induced dipoles did not converge"); } // update the lists of previous induced dipole values // shift previous m values up to m+1, add new values at m = 0 // only when preconditioner is used if (use_pred) { double ***udalt = fixudalt->tstore; double ***upalt = fixupalt->tstore; nualt = MIN(nualt+1,maxualt); for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { for (m = nualt-1; m > 0; m--) { udalt[i][m][j] = udalt[i][m-1][j]; upalt[i][m][j] = upalt[i][m-1][j]; } udalt[i][0][j] = uind[i][j]; upalt[i][0][j] = uinp[i][j]; } } } } /* ---------------------------------------------------------------------- udirect2b = Ewald real direct field via list udirect2b computes the real space contribution of the permanent atomic multipole moments to the field via a neighbor list ------------------------------------------------------------------------- */ void PairAmoebaGPU::udirect2b(double **field, double **fieldp) { if (!gpu_udirect2b_ready) { PairAmoeba::udirect2b(field, fieldp); return; } int inum; double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; sublo[2] = domain->sublo[2]; subhi[0] = domain->subhi[0]; subhi[1] = domain->subhi[1]; subhi[2] = domain->subhi[2]; } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } inum = atom->nlocal; // select the correct cutoff (off2) for the term if (use_ewald) choose(POLAR_LONG); else choose(POLAR); amoeba_gpu_compute_udirect2b(amtype, amgroup, rpole, uind, uinp, aewald, off2, &fieldp_pinned); // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs // NOTE: for the moment the tdipdip values are computed just in time in umutual2b() // so no need to call ubdirect2b_cpu(). // udirect2b_cpu(); // accumulate the field and fieldp values from the GPU lib // field and fieldp may already have some nonzero values from kspace (udirect1) int nlocal = atom->nlocal; if (acc_float) { auto field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } field_ptr += 3*inum; for (int i = 0; i < nlocal; i++) { int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; } } else { auto field_ptr = (double *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } field_ptr += 3*inum; for (int i = 0; i < nlocal; i++) { int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; } } } /* ---------------------------------------------------------------------- udirect2b = Ewald real direct field via list udirect2b computes the real space contribution of the permanent atomic multipole moments to the field via a neighbor list ------------------------------------------------------------------------- */ void PairAmoebaGPU::udirect2b_cpu() { int i,j,m,n,ii,jj,jextra,ndip,itype,jtype,igroup,jgroup; double xr,yr,zr,r,r2; double rr1,rr2,rr3,rr5; double bfac,exp2a; double ralpha,aefac; double aesq2,aesq2n; double pdi,pti; double pgamma; double damp,expdamp; double scale3,scale5; double scalek; double bn[4],bcn[3]; double factor_uscale; int inum,jnum; int *ilist,*jlist,*numneigh,**firstneigh; double **x = atom->x; // neigh list inum = list->inum; ilist = list->ilist; numneigh = list->numneigh; firstneigh = list->firstneigh; // NOTE: doesn't this have a problem if aewald is tiny ?? aesq2 = 2.0 * aewald * aewald; aesq2n = 0.0; if (aewald > 0.0) aesq2n = 1.0 / (MY_PIS*aewald); // rebuild dipole-dipole pair list and store pairwise dipole matrices // done one atom at a time in real-space double loop over atoms & neighs int *neighptr; double *tdipdip; // compute the real space portion of the Ewald summation for (ii = 0; ii < inum; ii++) { i = ilist[ii]; itype = amtype[i]; igroup = amgroup[i]; jlist = firstneigh[i]; jnum = numneigh[i]; n = ndip = 0; neighptr = ipage_dipole->vget(); tdipdip = dpage_dipdip->vget(); pdi = pdamp[itype]; pti = thole[itype]; // evaluate all sites within the cutoff distance for (jj = 0; jj < jnum; jj++) { jextra = jlist[jj]; j = jextra & NEIGHMASK15; xr = x[j][0] - x[i][0]; yr = x[j][1] - x[i][1]; zr = x[j][2] - x[i][2]; r2 = xr*xr + yr* yr + zr*zr; if (r2 > off2) continue; jtype = amtype[j]; jgroup = amgroup[j]; if (igroup == jgroup) factor_uscale = polar_uscale; else factor_uscale = 1.0; r = sqrt(r2); rr1 = 1.0 / r; rr2 = rr1 * rr1; rr3 = rr2 * rr1; rr5 = 3.0 * rr2 * rr3; // calculate the real space Ewald error function terms ralpha = aewald * r; bn[0] = erfc(ralpha) * rr1; exp2a = exp(-ralpha*ralpha); aefac = aesq2n; for (m = 1; m <= 3; m++) { bfac = m+m-1; aefac = aesq2 * aefac; bn[m] = (bfac*bn[m-1]+aefac*exp2a) * rr2; } // find terms needed later to compute mutual polarization if (poltyp != DIRECT) { scale3 = 1.0; scale5 = 1.0; damp = pdi * pdamp[jtype]; if (damp != 0.0) { pgamma = MIN(pti,thole[jtype]); damp = pgamma * pow(r/damp,3.0); if (damp < 50.0) { expdamp = exp(-damp); scale3 = 1.0 - expdamp; scale5 = 1.0 - expdamp*(1.0+damp); } } scalek = factor_uscale; bcn[0] = bn[1] - (1.0-scalek*scale3)*rr3; bcn[1] = bn[2] - (1.0-scalek*scale5)*rr5; neighptr[n++] = j; tdipdip[ndip++] = -bcn[0] + bcn[1]*xr*xr; tdipdip[ndip++] = bcn[1]*xr*yr; tdipdip[ndip++] = bcn[1]*xr*zr; tdipdip[ndip++] = -bcn[0] + bcn[1]*yr*yr; tdipdip[ndip++] = bcn[1]*yr*zr; tdipdip[ndip++] = -bcn[0] + bcn[1]*zr*zr; } else { if (comm->me == 0) printf("i = %d: j = %d: poltyp == DIRECT\n", i, j); } } // jj firstneigh_dipole[i] = neighptr; firstneigh_dipdip[i] = tdipdip; numneigh_dipole[i] = n; ipage_dipole->vgot(n); dpage_dipdip->vgot(ndip); } } /* ---------------------------------------------------------------------- ufield0c = mutual induction via Ewald sum ufield0c computes the mutual electrostatic field due to induced dipole moments via Ewald summation ------------------------------------------------------------------------- */ void PairAmoebaGPU::ufield0c(double **field, double **fieldp) { double term; // zero field,fieldp for owned and ghost atoms int nlocal = atom->nlocal; int nall = nlocal + atom->nghost; memset(&field[0][0], 0, 3*nall *sizeof(double)); memset(&fieldp[0][0], 0, 3*nall *sizeof(double)); // get the real space portion of the mutual field first double time0, time1, time2; MPI_Barrier(world); time0 = platform::walltime(); if (polar_rspace_flag) umutual2b(field,fieldp); time1 = platform::walltime(); // get the reciprocal space part of the mutual field if (polar_kspace_flag) umutual1(field,fieldp); time2 = platform::walltime(); // add the self-energy portion of the mutual field term = (4.0/3.0) * aewald*aewald*aewald / MY_PIS; for (int i = 0; i < nlocal; i++) { field[i][0] += term*uind[i][0]; field[i][1] += term*uind[i][1]; field[i][2] += term*uind[i][2]; } for (int i = 0; i < nlocal; i++) { fieldp[i][0] += term*uinp[i][0]; fieldp[i][1] += term*uinp[i][1]; fieldp[i][2] += term*uinp[i][2]; } // accumulate the field and fieldp values from the real-space portion from umutual2b() on the GPU // field and fieldp may already have some nonzero values from kspace (umutual1 and self) amoeba_gpu_update_fieldp(&fieldp_pinned); int inum = atom->nlocal; if (acc_float) { auto field_ptr = (float *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } field_ptr += 3*inum; for (int i = 0; i < nlocal; i++) { int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; } } else { auto field_ptr = (double *)fieldp_pinned; for (int i = 0; i < nlocal; i++) { int idx = 3*i; field[i][0] += field_ptr[idx]; field[i][1] += field_ptr[idx+1]; field[i][2] += field_ptr[idx+2]; } field_ptr += 3*inum; for (int i = 0; i < nlocal; i++) { int idx = 3*i; fieldp[i][0] += field_ptr[idx]; fieldp[i][1] += field_ptr[idx+1]; fieldp[i][2] += field_ptr[idx+2]; } } // accumulate timing information time_mutual_rspace += time1 - time0; time_mutual_kspace += time2 - time1; } /* ---------------------------------------------------------------------- umutual1 = Ewald recip mutual induced field umutual1 computes the reciprocal space contribution of the induced atomic dipole moments to the field ------------------------------------------------------------------------- */ void PairAmoebaGPU::umutual1(double **field, double **fieldp) { int m,n; int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; double term; double a[3][3]; // indices not flipped vs Fortran // return if the Ewald coefficient is zero if (aewald < 1.0e-6) return; // convert Cartesian dipoles to fractional coordinates for (int j = 0; j < 3; j++) { a[0][j] = nfft1 * recip[0][j]; a[1][j] = nfft2 * recip[1][j]; a[2][j] = nfft3 * recip[2][j]; } int nlocal = atom->nlocal; for (int i = 0; i < nlocal; i++) { fuind[i][0] = a[0][0]*uind[i][0] + a[0][1]*uind[i][1] + a[0][2]*uind[i][2]; fuind[i][1] = a[1][0]*uind[i][0] + a[1][1]*uind[i][1] + a[1][2]*uind[i][2]; fuind[i][2] = a[2][0]*uind[i][0] + a[2][1]*uind[i][1] + a[2][2]*uind[i][2]; } for (int i = 0; i < nlocal; i++) { fuinp[i][0] = a[0][0]*uinp[i][0] + a[0][1]*uinp[i][1] + a[0][2]*uinp[i][2]; fuinp[i][1] = a[1][0]*uinp[i][0] + a[1][1]*uinp[i][1] + a[1][2]*uinp[i][2]; fuinp[i][2] = a[2][0]*uinp[i][0] + a[2][1]*uinp[i][1] + a[2][2]*uinp[i][2]; } // gridpre = my portion of 4d grid in brick decomp w/ ghost values FFT_SCALAR ****gridpre = (FFT_SCALAR ****) ic_kspace->zero(); // map 2 values to grid double time0, time1; MPI_Barrier(world); time0 = platform::walltime(); grid_uind(fuind,fuinp,gridpre); time1 = platform::walltime(); time_grid_uind += (time1 - time0); // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition FFT_SCALAR *gridfft = ic_kspace->pre_convolution(); // --------------------- // convolution operation // --------------------- nxlo = ic_kspace->nxlo_fft; nxhi = ic_kspace->nxhi_fft; nylo = ic_kspace->nylo_fft; nyhi = ic_kspace->nyhi_fft; nzlo = ic_kspace->nzlo_fft; nzhi = ic_kspace->nzhi_fft; // use qfac values stored in udirect1() m = n = 0; for (int k = nzlo; k <= nzhi; k++) { for (int j = nylo; j <= nyhi; j++) { for (int i = nxlo; i <= nxhi; i++) { term = qfac[m++]; gridfft[n] *= term; gridfft[n+1] *= term; n += 2; } } } // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values FFT_SCALAR ****gridpost = (FFT_SCALAR ****) ic_kspace->post_convolution(); // get potential MPI_Barrier(world); time0 = platform::walltime(); fphi_uind(gridpost,fdip_phi1,fdip_phi2,fdip_sum_phi); time1 = platform::walltime(); time_fphi_uind += (time1 - time0); // store fractional reciprocal potentials for OPT method if (poltyp == OPT) { for (int i = 0; i < nlocal; i++) { for (int j = 0; j < 10; j++) { fopt[i][optlevel][j] = fdip_phi1[i][j]; foptp[i][optlevel][j] = fdip_phi2[i][j]; } } } for (int i = 0; i < nlocal; i++) { double dfx = a[0][0]*fdip_phi1[i][1] + a[0][1]*fdip_phi1[i][2] + a[0][2]*fdip_phi1[i][3]; double dfy = a[1][0]*fdip_phi1[i][1] + a[1][1]*fdip_phi1[i][2] + a[1][2]*fdip_phi1[i][3]; double dfz = a[2][0]*fdip_phi1[i][1] + a[2][1]*fdip_phi1[i][2] + a[2][2]*fdip_phi1[i][3]; field[i][0] -= dfx; field[i][1] -= dfy; field[i][2] -= dfz; } for (int i = 0; i < nlocal; i++) { double dfx = a[0][0]*fdip_phi2[i][1] + a[0][1]*fdip_phi2[i][2] + a[0][2]*fdip_phi2[i][3]; double dfy = a[1][0]*fdip_phi2[i][1] + a[1][1]*fdip_phi2[i][2] + a[1][2]*fdip_phi2[i][3]; double dfz = a[2][0]*fdip_phi2[i][1] + a[2][1]*fdip_phi2[i][2] + a[2][2]*fdip_phi2[i][3]; fieldp[i][0] -= dfx; fieldp[i][1] -= dfy; fieldp[i][2] -= dfz; } } /* ---------------------------------------------------------------------- fphi_uind = induced potential from grid fphi_uind extracts the induced dipole potential from the particle mesh Ewald grid ------------------------------------------------------------------------- */ void PairAmoebaGPU::fphi_uind(FFT_SCALAR ****grid, double **fdip_phi1, double **fdip_phi2, double **fdip_sum_phi) { if (!gpu_fphi_uind_ready) { PairAmoeba::fphi_uind(grid, fdip_phi1, fdip_phi2, fdip_sum_phi); return; } void* fdip_phi1_pinned = nullptr; void* fdip_phi2_pinned = nullptr; void* fdip_sum_phi_pinned = nullptr; amoeba_gpu_fphi_uind(grid, &fdip_phi1_pinned, &fdip_phi2_pinned, &fdip_sum_phi_pinned); int nlocal = atom->nlocal; if (acc_float) { auto _fdip_phi1_ptr = (float *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { int n = i; for (int m = 0; m < 10; m++) { fdip_phi1[i][m] = _fdip_phi1_ptr[n]; n += nlocal; } } auto _fdip_phi2_ptr = (float *)fdip_phi2_pinned; for (int i = 0; i < nlocal; i++) { int n = i; for (int m = 0; m < 10; m++) { fdip_phi2[i][m] = _fdip_phi2_ptr[n]; n += nlocal; } } auto _fdip_sum_phi_ptr = (float *)fdip_sum_phi_pinned; for (int i = 0; i < nlocal; i++) { int n = i; for (int m = 0; m < 20; m++) { fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; n += nlocal; } } } else { auto _fdip_phi1_ptr = (double *)fdip_phi1_pinned; for (int i = 0; i < nlocal; i++) { int n = i; for (int m = 0; m < 10; m++) { fdip_phi1[i][m] = _fdip_phi1_ptr[n]; n += nlocal; } } auto _fdip_phi2_ptr = (double *)fdip_phi2_pinned; for (int i = 0; i < nlocal; i++) { int n = i; for (int m = 0; m < 10; m++) { fdip_phi2[i][m] = _fdip_phi2_ptr[n]; n += nlocal; } } auto _fdip_sum_phi_ptr = (double *)fdip_sum_phi_pinned; for (int i = 0; i < nlocal; i++) { int n = i; for (int m = 0; m < 20; m++) { fdip_sum_phi[i][m] = _fdip_sum_phi_ptr[n]; n += nlocal; } } } } /* ---------------------------------------------------------------------- umutual2b = Ewald real mutual field via list umutual2b computes the real space contribution of the induced atomic dipole moments to the field via a neighbor list ------------------------------------------------------------------------- */ void PairAmoebaGPU::umutual2b(double **field, double **fieldp) { if (!gpu_umutual2b_ready) { PairAmoeba::umutual2b(field, fieldp); return; } double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; sublo[2] = domain->sublo[2]; subhi[0] = domain->subhi[0]; subhi[1] = domain->subhi[1]; subhi[2] = domain->subhi[2]; } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } // select the correct cutoff (off2) for the term if (use_ewald) choose(POLAR_LONG); else choose(POLAR); amoeba_gpu_compute_umutual2b(amtype, amgroup, rpole, uind, uinp, aewald, off2, &fieldp_pinned); } /* ---------------------------------------------------------------------- polar_real = real-space portion of induced dipole polarization adapted from Tinker epreal1d() routine ------------------------------------------------------------------------- */ void PairAmoebaGPU::polar_real() { if (!gpu_polar_real_ready) { PairAmoeba::polar_real(); return; } int eflag=1, vflag=1; double **f = atom->f; double sublo[3],subhi[3]; if (domain->triclinic == 0) { sublo[0] = domain->sublo[0]; sublo[1] = domain->sublo[1]; sublo[2] = domain->sublo[2]; subhi[0] = domain->subhi[0]; subhi[1] = domain->subhi[1]; subhi[2] = domain->subhi[2]; } else { domain->bbox(domain->sublo_lamda,domain->subhi_lamda,sublo,subhi); } // select the correct cutoff and aewald for the term if (use_ewald) choose(POLAR_LONG); else choose(POLAR); // set the energy unit conversion factor for polar real-space calculation double felec = 0.5 * electric / am_dielectric; amoeba_gpu_compute_polar_real(amtype, amgroup, rpole, uind, uinp, eflag, vflag, eflag_atom, vflag_atom, aewald, felec, off2, &tq_pinned); // reference to the tep array from GPU lib if (acc_float) { auto *tep_ptr = (float *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } else { auto *tep_ptr = (double *)tq_pinned; compute_force_from_torque(tep_ptr, f, virpolar); // fpolar } } /* ---------------------------------------------------------------------- polar_kspace = KSpace portion of induced dipole polarization adapted from Tinker eprecip1() routine same as PairAmoeba, except that fphi_uind() is reimplemented here ------------------------------------------------------------------------- */ void PairAmoebaGPU::polar_kspace() { int i,j,k,m,n; int nhalf1,nhalf2,nhalf3; int nxlo,nxhi,nylo,nyhi,nzlo,nzhi; int j1,j2,j3; int ix,iy,iz; double eterm,felec; double r1,r2,r3; double h1,h2,h3; double f1,f2,f3; double xix,yix,zix; double xiy,yiy,ziy; double xiz,yiz,ziz; double vxx,vyy,vzz; double vxy,vxz,vyz; double volterm,denom; double hsq,expterm; double term,pterm; double vterm,struc2; double tep[3]; double fix[3],fiy[3],fiz[3]; double cphid[4],cphip[4]; double a[3][3]; // indices not flipped vs Fortran bool gpu_fphi_mpole_ready = true; // indices into the electrostatic field array // decremented by 1 versus Fortran int deriv1[10] = {1, 4, 7, 8, 10, 15, 17, 13, 14, 19}; int deriv2[10] = {2, 7, 5, 9, 13, 11, 18, 15, 19, 16}; int deriv3[10] = {3, 8, 9, 6, 14, 16, 12, 19, 17, 18}; // return if the Ewald coefficient is zero if (aewald < 1.0e-6) return; // owned atoms double **x = atom->x; double **f = atom->f; int nlocal = atom->nlocal; double volbox = domain->prd[0] * domain->prd[1] * domain->prd[2]; pterm = pow((MY_PI/aewald),2.0); volterm = MY_PI * volbox; // initialize variables required for the scalar summation felec = electric / am_dielectric; // remove scalar sum virial from prior multipole FFT // can only do this if multipoles were computed with same aeewald = apewald // else need to re-compute it via new long-range solve nfft1 = p_kspace->nx; nfft2 = p_kspace->ny; nfft3 = p_kspace->nz; bsorder = p_kspace->order; nhalf1 = (nfft1+1) / 2; nhalf2 = (nfft2+1) / 2; nhalf3 = (nfft3+1) / 2; nxlo = p_kspace->nxlo_fft; nxhi = p_kspace->nxhi_fft; nylo = p_kspace->nylo_fft; nyhi = p_kspace->nyhi_fft; nzlo = p_kspace->nzlo_fft; nzhi = p_kspace->nzhi_fft; // use previous results or compute new qfac and convolution if (aewald == aeewald) { vxx = -vmsave[0]; vyy = -vmsave[1]; vzz = -vmsave[2]; vxy = -vmsave[3]; vxz = -vmsave[4]; vyz = -vmsave[5]; } else { // setup stencil size and B-spline coefficients moduli(); bspline_fill(); // allocate memory and make early host-device transfers // NOTE: this is for p_kspace, and igrid and thetai[1-3] are filled by bpsline_fill if (gpu_fphi_mpole_ready) { amoeba_gpu_precompute_kspace(atom->nlocal, bsorder, thetai1, thetai2, thetai3, igrid, p_kspace->nzlo_out, p_kspace->nzhi_out, p_kspace->nylo_out, p_kspace->nyhi_out, p_kspace->nxlo_out, p_kspace->nxhi_out); } // convert Cartesian multipoles to fractional coordinates cmp_to_fmp(cmp,fmp); // gridpre = my portion of 3d grid in brick decomp w/ ghost values FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid grid_mpole(fmp,gridpre); // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // --------------------- // convolution operation // --------------------- // zero virial accumulation variables vxx = vyy = vzz = vxy = vxz = vyz = 0.0; // perform convolution on K-space points I own m = n = 0; for (k = nzlo; k <= nzhi; k++) { for (j = nylo; j <= nyhi; j++) { for (i = nxlo; i <= nxhi; i++) { r1 = (i >= nhalf1) ? i-nfft1 : i; r2 = (j >= nhalf2) ? j-nfft2 : j; r3 = (k >= nhalf3) ? k-nfft3 : k; h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; hsq = h1*h1 + h2*h2 + h3*h3; term = -pterm * hsq; expterm = 0.0; if (term > -50.0 && hsq != 0.0) { denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; if (hsq) expterm = exp(term) / denom; struc2 = gridfft[n]*gridfft[n] + gridfft[n+1]*gridfft[n+1]; eterm = 0.5 * felec * expterm * struc2; vterm = (2.0/hsq) * (1.0-term) * eterm; vxx -= h1*h1*vterm - eterm; vyy -= h2*h2*vterm - eterm; vzz -= h3*h3*vterm - eterm; vxy -= h1*h2*vterm; vxz -= h1*h3*vterm; vyz -= h2*h3*vterm; } expterm = qfac[m++]; gridfft[n] *= expterm; gridfft[n+1] *= expterm; n += 2; } } } // post-convolution operations including backward FFT // gridppost = my portion of 3d grid in brick decomp w/ ghost values FFT_SCALAR ***gridpost = (FFT_SCALAR ***) p_kspace->post_convolution(); // get potential if (!gpu_fphi_mpole_ready) { fphi_mpole(gridpost,fphi); for (i = 0; i < nlocal; i++) { for (k = 0; k < 20; k++) fphi[i][k] *= felec; } } else { void* fphi_pinned = nullptr; amoeba_gpu_fphi_mpole(gridpost, &fphi_pinned, felec); if (acc_float) { auto _fphi_ptr = (float *)fphi_pinned; for (int i = 0; i < nlocal; i++) { int idx = i; for (int m = 0; m < 20; m++) { fphi[i][m] = _fphi_ptr[idx]; idx += nlocal; } } } else { auto _fphi_ptr = (double *)fphi_pinned; for (int i = 0; i < nlocal; i++) { int idx = i; for (int m = 0; m < 20; m++) { fphi[i][m] = _fphi_ptr[idx]; idx += nlocal; } } } } // convert field from fractional to Cartesian fphi_to_cphi(fphi,cphi); } // convert Cartesian induced dipoles to fractional coordinates for (i = 0; i < 3; i++) { a[0][i] = nfft1 * recip[0][i]; a[1][i] = nfft2 * recip[1][i]; a[2][i] = nfft3 * recip[2][i]; } for (i = 0; i < nlocal; i++) { for (j = 0; j < 3; j++) { fuind[i][j] = a[j][0]*uind[i][0] + a[j][1]*uind[i][1] + a[j][2]*uind[i][2]; fuinp[i][j] = a[j][0]*uinp[i][0] + a[j][1]*uinp[i][1] + a[j][2]*uinp[i][2]; } } // gridpre2 = my portion of 4d grid in brick decomp w/ ghost values FFT_SCALAR ****gridpre2 = (FFT_SCALAR ****) pc_kspace->zero(); // map 2 values to grid grid_uind(fuind,fuinp,gridpre2); // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomposition FFT_SCALAR *gridfft = pc_kspace->pre_convolution(); // --------------------- // convolution operation // --------------------- // use qfac values from above or from induce() m = n = 0; for (k = nzlo; k <= nzhi; k++) { for (j = nylo; j <= nyhi; j++) { for (i = nxlo; i <= nxhi; i++) { term = qfac[m++]; gridfft[n] *= term; gridfft[n+1] *= term; n += 2; } } } // post-convolution operations including backward FFT // gridppost = my portion of 4d grid in brick decomp w/ ghost values FFT_SCALAR ****gridpost = (FFT_SCALAR ****) pc_kspace->post_convolution(); // get potential fphi_uind(gridpost,fphid,fphip,fphidp); // TODO: port the remaining loops to the GPU for (i = 0; i < nlocal; i++) { for (j = 1; j < 10; j++) { fphid[i][j] = felec * fphid[i][j]; fphip[i][j] = felec * fphip[i][j]; } for (j = 0; j < 20; j++) fphidp[i][j] = felec * fphidp[i][j]; } // increment the dipole polarization gradient contributions for (i = 0; i < nlocal; i++) { f1 = 0.0; f2 = 0.0; f3 = 0.0; for (k = 0; k < 3; k++) { j1 = deriv1[k+1]; j2 = deriv2[k+1]; j3 = deriv3[k+1]; f1 += (fuind[i][k]+fuinp[i][k])*fphi[i][j1]; f2 += (fuind[i][k]+fuinp[i][k])*fphi[i][j2]; f3 += (fuind[i][k]+fuinp[i][k])*fphi[i][j3]; if (poltyp == MUTUAL) { f1 += fuind[i][k]*fphip[i][j1] + fuinp[i][k]*fphid[i][j1]; f2 += fuind[i][k]*fphip[i][j2] + fuinp[i][k]*fphid[i][j2]; f3 += fuind[i][k]*fphip[i][j3] + fuinp[i][k]*fphid[i][j3]; } } for (k = 0; k < 10; k++) { f1 += fmp[i][k]*fphidp[i][deriv1[k]]; f2 += fmp[i][k]*fphidp[i][deriv2[k]]; f3 += fmp[i][k]*fphidp[i][deriv3[k]]; } f1 *= 0.5 * nfft1; f2 *= 0.5 * nfft2; f3 *= 0.5 * nfft3; h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; f[i][0] -= h1; f[i][1] -= h2; f[i][2] -= h3; } // set the potential to be the induced dipole average for (i = 0; i < nlocal; i++) { for (j = 0; j < 10; j++) fphidp[i][j] *= 0.5; } fphi_to_cphi(fphidp,cphidp); // get the fractional to Cartesian transformation matrix //frac_to_cart(); // increment the dipole polarization virial contributions for (i = 0; i < nlocal; i++) { for (j = 1; j < 4; j++) { cphid[j] = 0.0; cphip[j] = 0.0; for (k = 1; k < 4; k++) { cphid[j] += ftc[j][k]*fphid[i][k]; cphip[j] += ftc[j][k]*fphip[i][k]; } } vxx -= cmp[i][1]*cphidp[i][1] + 0.5*((uind[i][0]+uinp[i][0])*cphi[i][1]); vyy -= cmp[i][2]*cphidp[i][2] + 0.5*((uind[i][1]+uinp[i][1])*cphi[i][2]); vzz -= cmp[i][3]*cphidp[i][3] + 0.5*((uind[i][2]+uinp[i][2])*cphi[i][3]); vxy -= 0.5*(cphidp[i][1]*cmp[i][2]+cphidp[i][2]*cmp[i][1]) + 0.25*((uind[i][1]+uinp[i][1])*cphi[i][1] + (uind[i][0]+uinp[i][0])*cphi[i][2]); vyz -= 0.5*(cphidp[i][2]*cmp[i][3]+cphidp[i][3]*cmp[i][2]) + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][2] + (uind[i][1]+uinp[i][1])*cphi[i][3]); vxz -= 0.5*(cphidp[i][1]*cmp[i][3]+cphidp[i][3]*cmp[i][1]) + 0.25*((uind[i][2]+uinp[i][2])*cphi[i][1] + (uind[i][0]+uinp[i][0])*cphi[i][3]); vxx -= 2.0*cmp[i][4]*cphidp[i][4] + cmp[i][7]*cphidp[i][7] + cmp[i][8]*cphidp[i][8]; vyy -= 2.0*cmp[i][5]*cphidp[i][5] + cmp[i][7]*cphidp[i][7] + cmp[i][9]*cphidp[i][9]; vzz -= 2.0*cmp[i][6]*cphidp[i][6] + cmp[i][8]*cphidp[i][8] + cmp[i][9]*cphidp[i][9]; vxy -= (cmp[i][4]+cmp[i][5])*cphidp[i][7] + 0.5*(cmp[i][7]*(cphidp[i][5]+cphidp[i][4]) + cmp[i][8]*cphidp[i][9]+cmp[i][9]*cphidp[i][8]); vyz -= (cmp[i][5]+cmp[i][6])*cphidp[i][9] + 0.5*(cmp[i][9]*(cphidp[i][5]+cphidp[i][6]) + cmp[i][7]*cphidp[i][8]+cmp[i][8]*cphidp[i][7]); vxz -= (cmp[i][4]+cmp[i][6])*cphidp[i][8] + 0.5*(cmp[i][8]*(cphidp[i][4]+cphidp[i][6]) + cmp[i][7]*cphidp[i][9]+cmp[i][9]*cphidp[i][7]); if (poltyp == MUTUAL) { vxx -= 0.5 * (cphid[1]*uinp[i][0]+cphip[1]*uind[i][0]); vyy -= 0.5 * (cphid[2]*uinp[i][1]+cphip[2]*uind[i][1]); vzz -= 0.5 * (cphid[3]*uinp[i][2]+cphip[3]*uind[i][2]); vxy -= 0.25 * (cphid[1]*uinp[i][1]+cphip[1]*uind[i][1] + cphid[2]*uinp[i][0]+cphip[2]*uind[i][0]); vyz -= 0.25 * (cphid[2]*uinp[i][2]+cphip[2]*uind[i][2] + cphid[3]*uinp[i][1]+cphip[3]*uind[i][1]); vxz -= 0.25 * (cphid[1]*uinp[i][2]+cphip[1]*uind[i][2] + cphid[3]*uinp[i][0]+cphip[3]*uind[i][0]); } } // resolve site torques then increment forces and virial for (i = 0; i < nlocal; i++) { tep[0] = cmp[i][3]*cphidp[i][2] - cmp[i][2]*cphidp[i][3] + 2.0*(cmp[i][6]-cmp[i][5])*cphidp[i][9] + cmp[i][8]*cphidp[i][7] + cmp[i][9]*cphidp[i][5]- cmp[i][7]*cphidp[i][8] - cmp[i][9]*cphidp[i][6]; tep[1] = cmp[i][1]*cphidp[i][3] - cmp[i][3]*cphidp[i][1] + 2.0*(cmp[i][4]-cmp[i][6])*cphidp[i][8] + cmp[i][7]*cphidp[i][9] + cmp[i][8]*cphidp[i][6] - cmp[i][8]*cphidp[i][4] - cmp[i][9]*cphidp[i][7]; tep[2] = cmp[i][2]*cphidp[i][1] - cmp[i][1]*cphidp[i][2] + 2.0*(cmp[i][5]-cmp[i][4])*cphidp[i][7] + cmp[i][7]*cphidp[i][4] + cmp[i][9]*cphidp[i][8] - cmp[i][7]*cphidp[i][5] - cmp[i][8]*cphidp[i][9]; torque2force(i,tep,fix,fiy,fiz,f); iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; xiz = x[iz][0] - x[i][0]; yiz = x[iz][1] - x[i][1]; ziz = x[iz][2] - x[i][2]; xix = x[ix][0] - x[i][0]; yix = x[ix][1] - x[i][1]; zix = x[ix][2] - x[i][2]; xiy = x[iy][0] - x[i][0]; yiy = x[iy][1] - x[i][1]; ziy = x[iy][2] - x[i][2]; vxx += xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; vyy += yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vzz += zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; vxy += 0.5*(yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); vyz += 0.5*(zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); vxz += 0.5*(zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); } // account for dipole response terms in the OPT method if (poltyp == OPT) { for (i = 0; i < nlocal; i++) { for (k = 0; k < optorder; k++) { for (j = 1; j < 10; j++) { fphid[i][j] = felec * fopt[i][k][j]; fphip[i][j] = felec * foptp[i][k][j]; } for (m = 0; m < optorder-k; m++) { for (j = 0; j < 3; j++) { fuind[i][j] = a[0][j]*uopt[i][m][0] + a[1][j]*uopt[i][m][1] + a[2][j]*uopt[i][m][2]; fuinp[i][j] = a[0][j]*uoptp[i][m][0] + a[1][j]*uoptp[i][m][1] + a[2][j]*uoptp[i][m][2]; } f1 = 0.0; f2 = 0.0; f3 = 0.0; for (j = 0; j < 3; j++) { j1 = deriv1[j+1]; j2 = deriv2[j+1]; j3 = deriv3[j+1]; f1 += fuind[i][j]*fphip[i][j1] + fuinp[i][j]*fphid[i][j1]; f2 += fuind[i][j]*fphip[i][j2] + fuinp[i][j]*fphid[i][j2]; f3 += fuind[i][j]*fphip[i][j3] + fuinp[i][j]*fphid[i][j3]; } f1 *= 0.5 * nfft1; f2 *= 0.5 * nfft2; f3 *= 0.5 * nfft3; h1 = recip[0][0]*f1 + recip[0][1]*f2 + recip[0][2]*f3; h2 = recip[1][0]*f1 + recip[1][1]*f2 + recip[1][2]*f3; h3 = recip[2][0]*f1 + recip[2][1]*f2 + recip[2][2]*f3; f[i][0] -= copm[k+m+1]*h1; f[i][1] -= copm[k+m+1]*h2; f[i][2] -= copm[k+m+1]*h3; for (j = 1; j < 4; j++) { cphid[j] = 0.0; cphip[j] = 0.0; for (j1 = 1; j1 < 4; j1++) { cphid[j] += ftc[j][j1]*fphid[i][j1]; cphip[j] += ftc[j][j1]*fphip[i][j1]; } } vxx -= 0.5*copm[k+m+1] * (cphid[1]*uoptp[i][m][0] + cphip[1]*uopt[i][m][0]); vyy -= 0.5*copm[k+m+1] * (cphid[2]*uoptp[i][m][1]+ cphip[2]*uopt[i][m][1]); vzz -= 0.5*copm[k+m+1] * (cphid[3]*uoptp[i][m][2]+ cphip[3]*uopt[i][m][2]); vxy -= 0.25*copm[k+m+1] * (cphid[1]*uoptp[i][m][1]+ cphip[1]*uopt[i][m][1]+ cphid[2]*uoptp[i][m][0]+ cphip[2]*uopt[i][m][0]); vyz -= 0.25*copm[k+m+1] * (cphid[1]*uoptp[i][m][2]+ cphip[1]*uopt[i][m][2]+ cphid[3]*uoptp[i][m][0]+ cphip[3]*uopt[i][m][0]); vxz -= 0.25*copm[k+m+1] * (cphid[2]*uoptp[i][m][2]+ cphip[2]*uopt[i][m][2]+ cphid[3]*uoptp[i][m][1]+ cphip[3]*uopt[i][m][1]); } } } } // assign permanent and induced multipoles to the PME grid for (i = 0; i < nlocal; i++) { for (j = 1; j < 4; j++) cmp[i][j] += uinp[i][j-1]; } // convert Cartesian multipoles to fractional multipoles cmp_to_fmp(cmp,fmp); // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid grid_mpole(fmp,gridpre); // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector gridfft = p_kspace->pre_convolution(); // gridfft1 = copy of first FFT int nfft_owned = p_kspace->nfft_owned; memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); // assign induced dipoles to the PME grid for (i = 0; i < nlocal; i++) { for (j = 1; j < 4; j++) cmp[i][j] += uind[i][j-1] - uinp[i][j-1]; } // convert Cartesian multipoles to fractional multipoles cmp_to_fmp(cmp,fmp); // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid grid_mpole(fmp,gridpre); // pre-convolution operations including forward FFT // gridfft1/2 = my portions of complex 3d grid in FFT decomp as 1d vectors FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation // --------------------- m = n = 0; for (k = nzlo; k <= nzhi; k++) { for (j = nylo; j <= nyhi; j++) { for (i = nxlo; i <= nxhi; i++) { r1 = (i >= nhalf1) ? i-nfft1 : i; r2 = (j >= nhalf2) ? j-nfft2 : j; r3 = (k >= nhalf3) ? k-nfft3 : k; h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; hsq = h1*h1 + h2*h2 + h3*h3; term = -pterm * hsq; expterm = 0.0; if (term > -50.0 && hsq != 0.0) { denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; expterm = exp(term) / denom; struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; eterm = 0.5 * felec * expterm * struc2; vterm = (2.0/hsq) * (1.0-term) * eterm; vxx += h1*h1*vterm - eterm; vyy += h2*h2*vterm - eterm; vzz += h3*h3*vterm - eterm; vxy += h1*h2*vterm; vyz += h2*h3*vterm; vxz += h1*h3*vterm; } n += 2; } } } // assign only the induced dipoles to the PME grid // and perform the 3-D FFT forward transformation // NOTE: why is there no inverse FFT in this section? if (poltyp == DIRECT || poltyp == TCG) { for (i = 0; i < nlocal; i++) { for (j = 0; j < 10; j++) cmp[i][j] = 0.0; for (j = 1; j < 4; j++) cmp[i][j] = uinp[i][j-1]; } // convert Cartesian multipoles to fractional multipoles cmp_to_fmp(cmp,fmp); // gridpre = my portion of 3d grid in brick decomp w/ ghost values // zeroed by zero() FFT_SCALAR ***gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid grid_mpole(fmp,gridpre); // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector FFT_SCALAR *gridfft = p_kspace->pre_convolution(); // gridfft1 = copy of first FFT int nfft_owned = p_kspace->nfft_owned; memcpy(gridfft1,gridfft,2*nfft_owned*sizeof(FFT_SCALAR)); // assign ??? to the PME grid for (i = 0; i < nlocal; i++) { for (j = 1; j < 4; j++) cmp[i][j] = uind[i][j-1]; } // convert Cartesian multipoles to fractional multipoles cmp_to_fmp(cmp,fmp); // gridpre = my portion of 3d grid in brick decomp w/ ghost values gridpre = (FFT_SCALAR ***) p_kspace->zero(); // map atoms to grid grid_mpole(fmp,gridpre); // pre-convolution operations including forward FFT // gridfft = my portion of complex 3d grid in FFT decomp as 1d vector FFT_SCALAR *gridfft2 = p_kspace->pre_convolution(); // --------------------- // convolution operation // --------------------- m = n = 0; for (k = nzlo; k <= nzhi; k++) { for (j = nylo; j <= nyhi; j++) { for (i = nxlo; i <= nxhi; i++) { r1 = (i >= nhalf1) ? i-nfft1 : i; r2 = (j >= nhalf2) ? j-nfft2 : j; r3 = (k >= nhalf3) ? k-nfft3 : k; h1 = recip[0][0]*r1 + recip[0][1]*r2 + recip[0][2]*r3; // matvec h2 = recip[1][0]*r1 + recip[1][1]*r2 + recip[1][2]*r3; h3 = recip[2][0]*r1 + recip[2][1]*r2 + recip[2][2]*r3; hsq = h1*h1 + h2*h2 + h3*h3; term = -pterm * hsq; expterm = 0.0; if (term > -50.0 && hsq != 0.0) { denom = volterm*hsq*bsmod1[i]*bsmod2[j]*bsmod3[k]; expterm = exp(term) / denom; struc2 = gridfft1[n]*gridfft2[n] + gridfft1[n+1]*gridfft2[n+1]; eterm = 0.5 * felec * expterm * struc2; vterm = (2.0/hsq) * (1.0-term) * eterm; vxx += h1*h1*vterm - eterm; vyy += h2*h2*vterm - eterm; vzz += h3*h3*vterm - eterm; vxy += h1*h2*vterm; vyz += h2*h3*vterm; vxz += h1*h3*vterm; } n += 2; } } } } // increment the total internal virial tensor components if (vflag_global) { virpolar[0] -= vxx; virpolar[1] -= vyy; virpolar[2] -= vzz; virpolar[3] -= vxy; virpolar[4] -= vxz; virpolar[5] -= vyz; } } /* ---------------------------------------------------------------------- compute atom forces from torques ------------------------------------------------------------------------- */ template void PairAmoebaGPU::compute_force_from_torque(const numtyp* tq_ptr, double** force_comp, double* virial_comp) { int i,ix,iy,iz; double xix,yix,zix; double xiy,yiy,ziy; double xiz,yiz,ziz; double vxx,vyy,vzz; double vxy,vxz,vyz; double fix[3],fiy[3],fiz[3],_tq[4]; double** x = atom->x; int nlocal = atom->nlocal; for (i = 0; i < nlocal; i++) { _tq[0] = tq_ptr[3*i]; _tq[1] = tq_ptr[3*i+1]; _tq[2] = tq_ptr[3*i+2]; torque2force(i,_tq,fix,fiy,fiz,force_comp); iz = zaxis2local[i]; ix = xaxis2local[i]; iy = yaxis2local[i]; xiz = x[iz][0] - x[i][0]; yiz = x[iz][1] - x[i][1]; ziz = x[iz][2] - x[i][2]; xix = x[ix][0] - x[i][0]; yix = x[ix][1] - x[i][1]; zix = x[ix][2] - x[i][2]; xiy = x[iy][0] - x[i][0]; yiy = x[iy][1] - x[i][1]; ziy = x[iy][2] - x[i][2]; vxx = xix*fix[0] + xiy*fiy[0] + xiz*fiz[0]; vyy = yix*fix[1] + yiy*fiy[1] + yiz*fiz[1]; vzz = zix*fix[2] + ziy*fiy[2] + ziz*fiz[2]; vxy = 0.5 * (yix*fix[0] + yiy*fiy[0] + yiz*fiz[0] + xix*fix[1] + xiy*fiy[1] + xiz*fiz[1]); vxz = 0.5 * (zix*fix[0] + ziy*fiy[0] + ziz*fiz[0] + xix*fix[2] + xiy*fiy[2] + xiz*fiz[2]); vyz = 0.5 * (zix*fix[1] + ziy*fiy[1] + ziz*fiz[1] + yix*fix[2] + yiy*fiy[2] + yiz*fiz[2]); virial_comp[0] -= vxx; virial_comp[1] -= vyy; virial_comp[2] -= vzz; virial_comp[3] -= vxy; virial_comp[4] -= vxz; virial_comp[5] -= vyz; } } /* ---------------------------------------------------------------------- */ double PairAmoebaGPU::memory_usage() { double bytes = Pair::memory_usage(); return bytes + amoeba_gpu_bytes(); }