/* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator http://lammps.sandia.gov, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains certain rights in this software. This software is distributed under the GNU General Public License. See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- Contributing authors: Rodrigo Canales (RWTH Aachen University) W. Michael Brown (Intel) ------------------------------------------------------------------------- */ #include #include #include #include "pppm_intel.h" #include "atom.h" #include "error.h" #include "gridcomm.h" #include "math_const.h" #include "math_special.h" #include "memory.h" #include "suffix.h" using namespace LAMMPS_NS; using namespace MathConst; using namespace MathSpecial; #define MAXORDER 7 #define OFFSET 16384 #define LARGE 10000.0 #define SMALL 0.00001 #define EPS_HOC 1.0e-7 enum{REVERSE_RHO}; enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; #ifdef FFT_SINGLE #define ZEROF 0.0f #define ONEF 1.0f #else #define ZEROF 0.0 #define ONEF 1.0 #endif /* ---------------------------------------------------------------------- */ PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg) { suffix_flag |= Suffix::INTEL; } PPPMIntel::~PPPMIntel() { } /* ---------------------------------------------------------------------- called once before run ------------------------------------------------------------------------- */ void PPPMIntel::init() { PPPM::init(); int ifix = modify->find_fix("package_intel"); if (ifix < 0) error->all(FLERR, "The 'package intel' command is required for /intel styles"); fix = static_cast(modify->fix[ifix]); #ifdef _LMP_INTEL_OFFLOAD _use_base = 0; if (fix->offload_balance() != 0.0) { _use_base = 1; return; } #endif fix->kspace_init_check(); if (order > INTEL_P3M_MAXORDER) error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n"); /* if (fix->precision() == FixIntel::PREC_MODE_MIXED) pack_force_const(force_const_single, fix->get_mixed_buffers()); else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) pack_force_const(force_const_double, fix->get_double_buffers()); else pack_force_const(force_const_single, fix->get_single_buffers()); */ } /* ---------------------------------------------------------------------- compute the PPPMIntel long-range force, energy, virial ------------------------------------------------------------------------- */ void PPPMIntel::compute(int eflag, int vflag) { #ifdef _LMP_INTEL_OFFLOAD if (_use_base) { PPPM::compute(eflag, vflag); return; } #endif compute_first(eflag,vflag); compute_second(eflag,vflag); } /* ---------------------------------------------------------------------- */ void PPPMIntel::compute_first(int eflag, int vflag) { int i,j; // set energy/virial flags // invoke allocate_peratom() if needed for first time if (eflag || vflag) ev_setup(eflag,vflag); else evflag = evflag_atom = eflag_global = vflag_global = eflag_atom = vflag_atom = 0; if (evflag_atom && !peratom_allocate_flag) { allocate_peratom(); cg_peratom->ghost_notify(); cg_peratom->setup(); } // if atom count has changed, update qsum and qsqsum if (atom->natoms != natoms_original) { qsum_qsq(); natoms_original = atom->natoms; } // return if there are no charges if (qsqsum == 0.0) return; // convert atoms from box to lamda coords if (triclinic == 0) boxlo = domain->boxlo; else { boxlo = domain->boxlo_lamda; domain->x2lamda(atom->nlocal); } // extend size of per-atom arrays if necessary if (atom->nmax > nmax) { memory->destroy(part2grid); nmax = atom->nmax; memory->create(part2grid,nmax,3,"pppm:part2grid"); } // find grid points for all my particles // map my particle charge onto my local 3d density grid if (fix->precision() == FixIntel::PREC_MODE_MIXED) { particle_map(fix->get_mixed_buffers()); make_rho(fix->get_mixed_buffers()); } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) { particle_map(fix->get_double_buffers()); make_rho(fix->get_double_buffers()); } else { particle_map(fix->get_single_buffers()); make_rho(fix->get_single_buffers()); } // all procs communicate density values from their ghost cells // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition cg->reverse_comm(this,REVERSE_RHO); brick2fft(); // compute potential gradient on my FFT grid and // portion of e_long on this proc's FFT grid // return gradients (electric fields) in 3d brick decomposition // also performs per-atom calculations via poisson_peratom() poisson(); // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); else cg->forward_comm(this,FORWARD_IK); // extra per-atom energy/virial communication if (evflag_atom) { if (differentiation_flag == 1 && vflag_atom) cg_peratom->forward_comm(this,FORWARD_AD_PERATOM); else if (differentiation_flag == 0) cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); } } /* ---------------------------------------------------------------------- */ void PPPMIntel::compute_second(int eflag, int vflag) { int i,j; // calculate the force on my particles if (differentiation_flag == 1) { if (fix->precision() == FixIntel::PREC_MODE_MIXED) fieldforce_ad(fix->get_mixed_buffers()); else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) fieldforce_ad(fix->get_double_buffers()); else fieldforce_ad(fix->get_single_buffers()); } else { if (fix->precision() == FixIntel::PREC_MODE_MIXED) fieldforce_ik(fix->get_mixed_buffers()); else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) fieldforce_ik(fix->get_double_buffers()); else fieldforce_ik(fix->get_single_buffers()); } // extra per-atom energy/virial communication if (evflag_atom) fieldforce_peratom(); // sum global energy across procs and add in volume-dependent term const double qscale = qqrd2e * scale; if (eflag_global) { double energy_all; MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world); energy = energy_all; energy *= 0.5*volume; energy -= g_ewald*qsqsum/MY_PIS + MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume); energy *= qscale; } // sum global virial across procs if (vflag_global) { double virial_all[6]; MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; } // per-atom energy/virial // energy includes self-energy correction // ntotal accounts for TIP4P tallying eatom/vatom for ghost atoms if (evflag_atom) { double *q = atom->q; int nlocal = atom->nlocal; int ntotal = nlocal; if (tip4pflag) ntotal += atom->nghost; if (eflag_atom) { for (i = 0; i < nlocal; i++) { eatom[i] *= 0.5; eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / (g_ewald*g_ewald*volume); eatom[i] *= qscale; } for (i = nlocal; i < ntotal; i++) eatom[i] *= 0.5*qscale; } if (vflag_atom) { for (i = 0; i < ntotal; i++) for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale; } } // 2d slab correction if (slabflag == 1) slabcorr(); // convert atoms back from lamda to box coords if (triclinic) domain->lamda2x(atom->nlocal); } /* ---------------------------------------------------------------------- find center grid pt for each of my particles check that full stencil for the particle will fit in my 3d brick store central grid pt indices in part2grid array ------------------------------------------------------------------------- */ template void PPPMIntel::particle_map(IntelBuffers *buffers) { int nx,ny,nz; ATOM_T * _noalias const x = buffers->get_x(0); int nlocal = atom->nlocal; int flag = 0; if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2])) error->one(FLERR,"Non-numeric box dimensions - simulation unstable"); const flt_t lo0 = boxlo[0]; const flt_t lo1 = boxlo[1]; const flt_t lo2 = boxlo[2]; const flt_t xi = delxinv; const flt_t yi = delyinv; const flt_t zi = delzinv; const flt_t fshift = shift; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned #pragma simd #endif for (int i = 0; i < nlocal; i++) { // (nx,ny,nz) = global coords of grid pt to "lower left" of charge // current particle coord can be outside global and local box // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 nx = static_cast ((x[i].x-lo0)*xi+fshift) - OFFSET; ny = static_cast ((x[i].y-lo1)*yi+fshift) - OFFSET; nz = static_cast ((x[i].z-lo2)*zi+fshift) - OFFSET; part2grid[i][0] = nx; part2grid[i][1] = ny; part2grid[i][2] = nz; // check that entire stencil around nx,ny,nz will fit in my 3d brick if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || ny+nlower < nylo_out || ny+nupper > nyhi_out || nz+nlower < nzlo_out || nz+nupper > nzhi_out) flag = 1; } if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM"); } /* ---------------------------------------------------------------------- create discretized "density" on section of global grid due to my particles density(x,y,z) = charge "density" at grid points of my 3d brick (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) in global grid ------------------------------------------------------------------------- */ template void PPPMIntel::make_rho(IntelBuffers *buffers) { // clear 3d density array memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0, ngrid*sizeof(FFT_SCALAR)); // loop over my charges, add their contribution to nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge // (dx,dy,dz) = distance to "lower left" grid pt // (mx,my,mz) = global coords of moving stencil pt ATOM_T * _noalias const x = buffers->get_x(0); flt_t * _noalias const q = buffers->get_q(0); int nlocal = atom->nlocal; const flt_t lo0 = boxlo[0]; const flt_t lo1 = boxlo[1]; const flt_t lo2 = boxlo[2]; const flt_t xi = delxinv; const flt_t yi = delyinv; const flt_t zi = delzinv; const flt_t fshift = shift; const flt_t fshiftone = shiftone; const flt_t fdelvolinv = delvolinv; for (int i = 0; i < nlocal; i++) { int nx = part2grid[i][0]; int ny = part2grid[i][1]; int nz = part2grid[i][2]; FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; flt_t rho[3][INTEL_P3M_MAXORDER]; for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3; r1 = r2 = r3 = ZEROF; for (int l = order-1; l >= 0; l--) { r1 = rho_coeff[l][k] + r1*dx; r2 = rho_coeff[l][k] + r2*dy; r3 = rho_coeff[l][k] + r3*dz; } rho[0][k-nlower] = r1; rho[1][k-nlower] = r2; rho[2][k-nlower] = r3; } FFT_SCALAR z0 = fdelvolinv * q[i]; for (int n = nlower; n <= nupper; n++) { int mz = n+nz; FFT_SCALAR y0 = z0*rho[2][n-nlower]; for (int m = nlower; m <= nupper; m++) { int my = m+ny; FFT_SCALAR x0 = y0*rho[1][m-nlower]; for (int l = nlower; l <= nupper; l++) { int mx = l+nx; density_brick[mz][my][mx] += x0*rho[0][l-nlower]; } } } } } /* ---------------------------------------------------------------------- interpolate from grid to get electric field & force on my particles for ik ------------------------------------------------------------------------- */ template void PPPMIntel::fieldforce_ik(IntelBuffers *buffers) { // loop over my charges, interpolate electric field from nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge // (dx,dy,dz) = distance to "lower left" grid pt // (mx,my,mz) = global coords of moving stencil pt // ek = 3 components of E-field on particle ATOM_T * _noalias const x = buffers->get_x(0); flt_t * _noalias const q = buffers->get_q(0); FORCE_T * _noalias const f = buffers->get_f(); int nlocal = atom->nlocal; const flt_t lo0 = boxlo[0]; const flt_t lo1 = boxlo[1]; const flt_t lo2 = boxlo[2]; const flt_t xi = delxinv; const flt_t yi = delyinv; const flt_t zi = delzinv; const flt_t fshiftone = shiftone; const flt_t fqqrd2es = qqrd2e * scale; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned nontemporal #pragma simd #endif for (int i = 0; i < nlocal; i++) { int nx = part2grid[i][0]; int ny = part2grid[i][1]; int nz = part2grid[i][2]; FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; flt_t rho[3][INTEL_P3M_MAXORDER]; for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1 = rho_coeff[order-1][k]; FFT_SCALAR r2 = rho_coeff[order-1][k]; FFT_SCALAR r3 = rho_coeff[order-1][k]; for (int l = order-2; l >= 0; l--) { r1 = rho_coeff[l][k] + r1*dx; r2 = rho_coeff[l][k] + r2*dy; r3 = rho_coeff[l][k] + r3*dz; } rho[0][k-nlower] = r1; rho[1][k-nlower] = r2; rho[2][k-nlower] = r3; } FFT_SCALAR ekx, eky, ekz; ekx = eky = ekz = ZEROF; for (int n = nlower; n <= nupper; n++) { int mz = n+nz; FFT_SCALAR z0 = rho[2][n-nlower]; for (int m = nlower; m <= nupper; m++) { int my = m+ny; FFT_SCALAR y0 = z0*rho[1][m-nlower]; for (int l = nlower; l <= nupper; l++) { int mx = l+nx; FFT_SCALAR x0 = y0*rho[0][l-nlower]; ekx -= x0*vdx_brick[mz][my][mx]; eky -= x0*vdy_brick[mz][my][mx]; ekz -= x0*vdz_brick[mz][my][mx]; } } } // convert E-field to force const flt_t qfactor = fqqrd2es * q[i]; f[i].x += qfactor*ekx; f[i].y += qfactor*eky; if (slabflag != 2) f[i].z += qfactor*ekz; } } /* ---------------------------------------------------------------------- interpolate from grid to get electric field & force on my particles for ad ------------------------------------------------------------------------- */ template void PPPMIntel::fieldforce_ad(IntelBuffers *buffers) { // loop over my charges, interpolate electric field from nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge // (dx,dy,dz) = distance to "lower left" grid pt // (mx,my,mz) = global coords of moving stencil pt // ek = 3 components of E-field on particle ATOM_T * _noalias const x = buffers->get_x(0); const flt_t * _noalias const q = buffers->get_q(0); FORCE_T * _noalias const f = buffers->get_f(); int nlocal = atom->nlocal; const flt_t ftwo_pi = MY_PI * 2.0; const flt_t ffour_pi = MY_PI * 4.0; const flt_t lo0 = boxlo[0]; const flt_t lo1 = boxlo[1]; const flt_t lo2 = boxlo[2]; const flt_t xi = delxinv; const flt_t yi = delyinv; const flt_t zi = delzinv; const flt_t fshiftone = shiftone; const flt_t fqqrd2es = qqrd2e * scale; const double *prd = domain->prd; const double xprd = prd[0]; const double yprd = prd[1]; const double zprd = prd[2]; const flt_t hx_inv = nx_pppm/xprd; const flt_t hy_inv = ny_pppm/yprd; const flt_t hz_inv = nz_pppm/zprd; const flt_t fsf_coeff0 = sf_coeff[0]; const flt_t fsf_coeff1 = sf_coeff[1]; const flt_t fsf_coeff2 = sf_coeff[2]; const flt_t fsf_coeff3 = sf_coeff[3]; const flt_t fsf_coeff4 = sf_coeff[4]; const flt_t fsf_coeff5 = sf_coeff[5]; #if defined(LMP_SIMD_COMPILER) #pragma vector aligned nontemporal #pragma simd #endif for (int i = 0; i < nlocal; i++) { int nx = part2grid[i][0]; int ny = part2grid[i][1]; int nz = part2grid[i][2]; FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi; FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi; FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi; flt_t rho[3][INTEL_P3M_MAXORDER]; flt_t drho[3][INTEL_P3M_MAXORDER]; for (int k = nlower; k <= nupper; k++) { FFT_SCALAR r1,r2,r3,dr1,dr2,dr3; dr1 = dr2 = dr3 = ZEROF; r1 = rho_coeff[order-1][k]; r2 = rho_coeff[order-1][k]; r3 = rho_coeff[order-1][k]; for (int l = order-2; l >= 0; l--) { r1 = rho_coeff[l][k] + r1 * dx; r2 = rho_coeff[l][k] + r2 * dy; r3 = rho_coeff[l][k] + r3 * dz; dr1 = drho_coeff[l][k] + dr1 * dx; dr2 = drho_coeff[l][k] + dr2 * dy; dr3 = drho_coeff[l][k] + dr3 * dz; } rho[0][k-nlower] = r1; rho[1][k-nlower] = r2; rho[2][k-nlower] = r3; drho[0][k-nlower] = dr1; drho[1][k-nlower] = dr2; drho[2][k-nlower] = dr3; } FFT_SCALAR ekx, eky, ekz; ekx = eky = ekz = ZEROF; for (int n = nlower; n <= nupper; n++) { int mz = n+nz; for (int m = nlower; m <= nupper; m++) { int my = m+ny; FFT_SCALAR ekx_p = rho[1][m-nlower] * rho[2][n-nlower]; FFT_SCALAR eky_p = drho[1][m-nlower] * rho[2][n-nlower]; FFT_SCALAR ekz_p = rho[1][m-nlower] * drho[2][n-nlower]; for (int l = nlower; l <= nupper; l++) { int mx = l+nx; ekx += drho[0][l-nlower] * ekx_p * u_brick[mz][my][mx]; eky += rho[0][l-nlower] * eky_p * u_brick[mz][my][mx]; ekz += rho[0][l-nlower] * ekz_p * u_brick[mz][my][mx]; } } } ekx *= hx_inv; eky *= hy_inv; ekz *= hz_inv; // convert E-field to force const flt_t qfactor = fqqrd2es * q[i]; const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i]; const flt_t s1 = x[i].x * hx_inv; const flt_t s2 = x[i].y * hy_inv; const flt_t s3 = x[i].z * hz_inv; flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1); sf += fsf_coeff1 * sin(ffour_pi * s1); sf *= twoqsq; f[i].x += qfactor * ekx - fqqrd2es * sf; sf = fsf_coeff2 * sin(ftwo_pi * s2); sf += fsf_coeff3 * sin(ffour_pi * s2); sf *= twoqsq; f[i].y += qfactor * eky - fqqrd2es * sf; sf = fsf_coeff4 * sin(ftwo_pi * s3); sf += fsf_coeff5 * sin(ffour_pi * s3); sf *= twoqsq; if (slabflag != 2) f[i].z += qfactor * ekz - fqqrd2es * sf; } } /* ---------------------------------------------------------------------- Pack data into intel package buffers if using LRT mode ------------------------------------------------------------------------- */ void PPPMIntel::pack_buffers() { fix->start_watch(TIME_PACK); #if defined(_OPENMP) #pragma omp parallel default(none) #endif { int ifrom, ito, tid; IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost, comm->nthreads, sizeof(IntelBuffers::atom_t)); if (fix->precision() == FixIntel::PREC_MODE_MIXED) fix->get_mixed_buffers()->thr_pack(ifrom,ito,1); else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) fix->get_double_buffers()->thr_pack(ifrom,ito,1); else fix->get_single_buffers()->thr_pack(ifrom,ito,1); } fix->stop_watch(TIME_PACK); } /* ---------------------------------------------------------------------- Returns 0 if Intel optimizations for PPPM ignored due to offload ------------------------------------------------------------------------- */ #ifdef _LMP_INTEL_OFFLOAD int PPPMIntel::use_base() { return _use_base; } #endif