lammps/src/USER-INTEL/pppm_intel.cpp

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing authors: Rodrigo Canales (RWTH Aachen University)
                         W. Michael Brown (Intel)
------------------------------------------------------------------------- */

#include <mpi.h>
#include <stdlib.h>
#include <math.h>
#include "pppm_intel.h"
#include "atom.h"
#include "error.h"
#include "gridcomm.h"
#include "math_const.h"
#include "math_special.h"
#include "memory.h"
#include "suffix.h"

using namespace LAMMPS_NS;
using namespace MathConst;
using namespace MathSpecial;

#define MAXORDER 7
#define OFFSET 16384
#define LARGE 10000.0
#define SMALL 0.00001
#define EPS_HOC 1.0e-7

enum{REVERSE_RHO};
enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM};

#ifdef FFT_SINGLE
#define ZEROF 0.0f
#define ONEF  1.0f
#else
#define ZEROF 0.0
#define ONEF  1.0
#endif

/* ---------------------------------------------------------------------- */

PPPMIntel::PPPMIntel(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
{
  suffix_flag |= Suffix::INTEL;
}

PPPMIntel::~PPPMIntel()
{
}

/* ----------------------------------------------------------------------
   called once before run
------------------------------------------------------------------------- */

void PPPMIntel::init()
{
  PPPM::init();
  int ifix = modify->find_fix("package_intel");
  if (ifix < 0)
    error->all(FLERR,
               "The 'package intel' command is required for /intel styles");
  fix = static_cast<FixIntel *>(modify->fix[ifix]);

  #ifdef _LMP_INTEL_OFFLOAD
  _use_base = 0;
  if (fix->offload_balance() != 0.0) {
    _use_base = 1;
    return;
  }
  #endif

  fix->kspace_init_check();

  if (order > INTEL_P3M_MAXORDER)
    error->all(FLERR,"PPPM order greater than supported by USER-INTEL\n");

  /*
  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
    pack_force_const(force_const_single, fix->get_mixed_buffers());
  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
    pack_force_const(force_const_double, fix->get_double_buffers());
  else
    pack_force_const(force_const_single, fix->get_single_buffers());
  */
}

/* ----------------------------------------------------------------------
   compute the PPPMIntel long-range force, energy, virial
------------------------------------------------------------------------- */

void PPPMIntel::compute(int eflag, int vflag)
{
  #ifdef _LMP_INTEL_OFFLOAD
  if (_use_base) {
    PPPM::compute(eflag, vflag);
    return;
  }
  #endif
  compute_first(eflag,vflag);
  compute_second(eflag,vflag);
}

/* ---------------------------------------------------------------------- */

void PPPMIntel::compute_first(int eflag, int vflag)
{
  int i,j;

  // set energy/virial flags
  // invoke allocate_peratom() if needed for first time

  if (eflag || vflag) ev_setup(eflag,vflag);
  else evflag = evflag_atom = eflag_global = vflag_global =
         eflag_atom = vflag_atom = 0;

  if (evflag_atom && !peratom_allocate_flag) {
    allocate_peratom();
    cg_peratom->ghost_notify();
    cg_peratom->setup();
  }

  // if atom count has changed, update qsum and qsqsum

  if (atom->natoms != natoms_original) {
    qsum_qsq();
    natoms_original = atom->natoms;
  }

  // return if there are no charges

  if (qsqsum == 0.0) return;

  // convert atoms from box to lamda coords

  if (triclinic == 0) boxlo = domain->boxlo;
  else {
    boxlo = domain->boxlo_lamda;
    domain->x2lamda(atom->nlocal);
  }

  // extend size of per-atom arrays if necessary

  if (atom->nmax > nmax) {
    memory->destroy(part2grid);
    nmax = atom->nmax;
    memory->create(part2grid,nmax,3,"pppm:part2grid");
  }

  // find grid points for all my particles
  // map my particle charge onto my local 3d density grid

  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
    particle_map<float,double>(fix->get_mixed_buffers());
    make_rho<float,double>(fix->get_mixed_buffers());
  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
    particle_map<double,double>(fix->get_double_buffers());
    make_rho<double,double>(fix->get_double_buffers());
  } else {
    particle_map<float,float>(fix->get_single_buffers());
    make_rho<float,float>(fix->get_single_buffers());
  }

  // all procs communicate density values from their ghost cells
  //   to fully sum contribution in their 3d bricks
  // remap from 3d decomposition to FFT decomposition

  cg->reverse_comm(this,REVERSE_RHO);
  brick2fft();

  // compute potential gradient on my FFT grid and
  //   portion of e_long on this proc's FFT grid
  // return gradients (electric fields) in 3d brick decomposition
  // also performs per-atom calculations via poisson_peratom()

  poisson();

  // all procs communicate E-field values
  // to fill ghost cells surrounding their 3d bricks

  if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
  else cg->forward_comm(this,FORWARD_IK);

  // extra per-atom energy/virial communication

  if (evflag_atom) {
    if (differentiation_flag == 1 && vflag_atom)
      cg_peratom->forward_comm(this,FORWARD_AD_PERATOM);
    else if (differentiation_flag == 0)
      cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
  }
}

/* ---------------------------------------------------------------------- */

void PPPMIntel::compute_second(int eflag, int vflag)
{
  int i,j;

  // calculate the force on my particles

  if (differentiation_flag == 1) {
    if (fix->precision() == FixIntel::PREC_MODE_MIXED)
      fieldforce_ad<float,double>(fix->get_mixed_buffers());
    else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
      fieldforce_ad<double,double>(fix->get_double_buffers());
    else
      fieldforce_ad<float,float>(fix->get_single_buffers());
  } else {
    if (fix->precision() == FixIntel::PREC_MODE_MIXED)
      fieldforce_ik<float,double>(fix->get_mixed_buffers());
    else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
      fieldforce_ik<double,double>(fix->get_double_buffers());
    else
      fieldforce_ik<float,float>(fix->get_single_buffers());
  }

  // extra per-atom energy/virial communication

  if (evflag_atom) fieldforce_peratom();

  // sum global energy across procs and add in volume-dependent term

  const double qscale = qqrd2e * scale;

  if (eflag_global) {
    double energy_all;
    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
    energy = energy_all;

    energy *= 0.5*volume;
    energy -= g_ewald*qsqsum/MY_PIS +
      MY_PI2*qsum*qsum / (g_ewald*g_ewald*volume);
    energy *= qscale;
  }

  // sum global virial across procs

  if (vflag_global) {
    double virial_all[6];
    MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
  }

  // per-atom energy/virial
  // energy includes self-energy correction
  // ntotal accounts for TIP4P tallying eatom/vatom for ghost atoms

  if (evflag_atom) {
    double *q = atom->q;
    int nlocal = atom->nlocal;
    int ntotal = nlocal;
    if (tip4pflag) ntotal += atom->nghost;

    if (eflag_atom) {
      for (i = 0; i < nlocal; i++) {
        eatom[i] *= 0.5;
        eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum /
          (g_ewald*g_ewald*volume);
        eatom[i] *= qscale;
      }
      for (i = nlocal; i < ntotal; i++) eatom[i] *= 0.5*qscale;
    }

    if (vflag_atom) {
      for (i = 0; i < ntotal; i++)
        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale;
    }
  }

  // 2d slab correction

  if (slabflag == 1) slabcorr();

  // convert atoms back from lamda to box coords

  if (triclinic) domain->lamda2x(atom->nlocal);
}

/* ----------------------------------------------------------------------
   find center grid pt for each of my particles
   check that full stencil for the particle will fit in my 3d brick
   store central grid pt indices in part2grid array
------------------------------------------------------------------------- */

template<class flt_t, class acc_t>
void PPPMIntel::particle_map(IntelBuffers<flt_t,acc_t> *buffers)
{
  int nx,ny,nz;

  ATOM_T * _noalias const x = buffers->get_x(0);
  int nlocal = atom->nlocal;

  int flag = 0;

  if (!ISFINITE(boxlo[0]) || !ISFINITE(boxlo[1]) || !ISFINITE(boxlo[2]))
    error->one(FLERR,"Non-numeric box dimensions - simulation unstable");

  const flt_t lo0 = boxlo[0];
  const flt_t lo1 = boxlo[1];
  const flt_t lo2 = boxlo[2];
  const flt_t xi = delxinv;
  const flt_t yi = delyinv;
  const flt_t zi = delzinv;
  const flt_t fshift = shift;

  #if defined(LMP_SIMD_COMPILER)
  #pragma vector aligned
  #pragma simd
  #endif
  for (int i = 0; i < nlocal; i++) {

    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
    // current particle coord can be outside global and local box
    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1

    nx = static_cast<int> ((x[i].x-lo0)*xi+fshift) - OFFSET;
    ny = static_cast<int> ((x[i].y-lo1)*yi+fshift) - OFFSET;
    nz = static_cast<int> ((x[i].z-lo2)*zi+fshift) - OFFSET;

    part2grid[i][0] = nx;
    part2grid[i][1] = ny;
    part2grid[i][2] = nz;

    // check that entire stencil around nx,ny,nz will fit in my 3d brick

    if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
        ny+nlower < nylo_out || ny+nupper > nyhi_out ||
        nz+nlower < nzlo_out || nz+nupper > nzhi_out)
      flag = 1;
  }

  if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM");
}


/* ----------------------------------------------------------------------
   create discretized "density" on section of global grid due to my particles
   density(x,y,z) = charge "density" at grid points of my 3d brick
   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
   in global grid
------------------------------------------------------------------------- */

template<class flt_t, class acc_t>
void PPPMIntel::make_rho(IntelBuffers<flt_t,acc_t> *buffers)
{
  // clear 3d density array

  memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0,
         ngrid*sizeof(FFT_SCALAR));

  // loop over my charges, add their contribution to nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt

  ATOM_T * _noalias const x = buffers->get_x(0);
  flt_t * _noalias const q = buffers->get_q(0);
  int nlocal = atom->nlocal;

  const flt_t lo0 = boxlo[0];
  const flt_t lo1 = boxlo[1];
  const flt_t lo2 = boxlo[2];
  const flt_t xi = delxinv;
  const flt_t yi = delyinv;
  const flt_t zi = delzinv;
  const flt_t fshift = shift;
  const flt_t fshiftone = shiftone;
  const flt_t fdelvolinv = delvolinv;

  for (int i = 0; i < nlocal; i++) {

    int nx = part2grid[i][0];
    int ny = part2grid[i][1];
    int nz = part2grid[i][2];
    FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
    FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
    FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;

    flt_t rho[3][INTEL_P3M_MAXORDER];

    for (int k = nlower; k <= nupper; k++) {
      FFT_SCALAR r1,r2,r3;
      r1 = r2 = r3 = ZEROF;

      for (int l = order-1; l >= 0; l--) {
        r1 = rho_coeff[l][k] + r1*dx;
        r2 = rho_coeff[l][k] + r2*dy;
        r3 = rho_coeff[l][k] + r3*dz;
      }
      rho[0][k-nlower] = r1;
      rho[1][k-nlower] = r2;
      rho[2][k-nlower] = r3;
    }

    FFT_SCALAR z0 = fdelvolinv * q[i];
    for (int n = nlower; n <= nupper; n++) {
      int mz = n+nz;
      FFT_SCALAR y0 = z0*rho[2][n-nlower];
      for (int m = nlower; m <= nupper; m++) {
        int my = m+ny;
        FFT_SCALAR x0 = y0*rho[1][m-nlower];
        for (int l = nlower; l <= nupper; l++) {
          int mx = l+nx;
          density_brick[mz][my][mx] += x0*rho[0][l-nlower];
        }
      }
    }
  }
}

/* ----------------------------------------------------------------------
   interpolate from grid to get electric field & force on my particles for ik
------------------------------------------------------------------------- */

template<class flt_t, class acc_t>
void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
{
  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // ek = 3 components of E-field on particle

  ATOM_T * _noalias const x = buffers->get_x(0);
  flt_t * _noalias const q = buffers->get_q(0);
  FORCE_T * _noalias const f = buffers->get_f();
  int nlocal = atom->nlocal;

  const flt_t lo0 = boxlo[0];
  const flt_t lo1 = boxlo[1];
  const flt_t lo2 = boxlo[2];
  const flt_t xi = delxinv;
  const flt_t yi = delyinv;
  const flt_t zi = delzinv;
  const flt_t fshiftone = shiftone;
  const flt_t fqqrd2es = qqrd2e * scale;

  #if defined(LMP_SIMD_COMPILER)
  #pragma vector aligned nontemporal
  #pragma simd
  #endif
  for (int i = 0; i < nlocal; i++) {
    int nx = part2grid[i][0];
    int ny = part2grid[i][1];
    int nz = part2grid[i][2];
    FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
    FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
    FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;

    flt_t rho[3][INTEL_P3M_MAXORDER];

    for (int k = nlower; k <= nupper; k++) {
      FFT_SCALAR r1 = rho_coeff[order-1][k];
      FFT_SCALAR r2 = rho_coeff[order-1][k];
      FFT_SCALAR r3 = rho_coeff[order-1][k];
      for (int l = order-2; l >= 0; l--) {
        r1 = rho_coeff[l][k] + r1*dx;
        r2 = rho_coeff[l][k] + r2*dy;
        r3 = rho_coeff[l][k] + r3*dz;
      }
      rho[0][k-nlower] = r1;
      rho[1][k-nlower] = r2;
      rho[2][k-nlower] = r3;
    }

    FFT_SCALAR ekx, eky, ekz;
    ekx = eky = ekz = ZEROF;
    for (int n = nlower; n <= nupper; n++) {
      int mz = n+nz;
      FFT_SCALAR z0 = rho[2][n-nlower];
      for (int m = nlower; m <= nupper; m++) {
        int my = m+ny;
        FFT_SCALAR y0 = z0*rho[1][m-nlower];
        for (int l = nlower; l <= nupper; l++) {
          int mx = l+nx;
          FFT_SCALAR x0 = y0*rho[0][l-nlower];
          ekx -= x0*vdx_brick[mz][my][mx];
          eky -= x0*vdy_brick[mz][my][mx];
          ekz -= x0*vdz_brick[mz][my][mx];
        }
      }
    }

    // convert E-field to force

    const flt_t qfactor = fqqrd2es * q[i];
    f[i].x += qfactor*ekx;
    f[i].y += qfactor*eky;
    if (slabflag != 2) f[i].z += qfactor*ekz;
  }
}

/* ----------------------------------------------------------------------
   interpolate from grid to get electric field & force on my particles for ad
------------------------------------------------------------------------- */

template<class flt_t, class acc_t>
void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
{
  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // ek = 3 components of E-field on particle

  ATOM_T * _noalias const x = buffers->get_x(0);
  const flt_t * _noalias const q = buffers->get_q(0);
  FORCE_T * _noalias const f = buffers->get_f();
  int nlocal = atom->nlocal;

  const flt_t ftwo_pi = MY_PI * 2.0;
  const flt_t ffour_pi = MY_PI * 4.0;

  const flt_t lo0 = boxlo[0];
  const flt_t lo1 = boxlo[1];
  const flt_t lo2 = boxlo[2];
  const flt_t xi = delxinv;
  const flt_t yi = delyinv;
  const flt_t zi = delzinv;
  const flt_t fshiftone = shiftone;
  const flt_t fqqrd2es = qqrd2e * scale;

  const double *prd = domain->prd;
  const double xprd = prd[0];
  const double yprd = prd[1];
  const double zprd = prd[2];

  const flt_t hx_inv = nx_pppm/xprd;
  const flt_t hy_inv = ny_pppm/yprd;
  const flt_t hz_inv = nz_pppm/zprd;

  const flt_t fsf_coeff0 = sf_coeff[0];
  const flt_t fsf_coeff1 = sf_coeff[1];
  const flt_t fsf_coeff2 = sf_coeff[2];
  const flt_t fsf_coeff3 = sf_coeff[3];
  const flt_t fsf_coeff4 = sf_coeff[4];
  const flt_t fsf_coeff5 = sf_coeff[5];

  #if defined(LMP_SIMD_COMPILER)
  #pragma vector aligned nontemporal
  #pragma simd
  #endif
  for (int i = 0; i < nlocal; i++) {
    int nx = part2grid[i][0];
    int ny = part2grid[i][1];
    int nz = part2grid[i][2];
    FFT_SCALAR dx = nx+fshiftone - (x[i].x-lo0)*xi;
    FFT_SCALAR dy = ny+fshiftone - (x[i].y-lo1)*yi;
    FFT_SCALAR dz = nz+fshiftone - (x[i].z-lo2)*zi;

    flt_t rho[3][INTEL_P3M_MAXORDER];
    flt_t drho[3][INTEL_P3M_MAXORDER];

    for (int k = nlower; k <= nupper; k++) {
      FFT_SCALAR r1,r2,r3,dr1,dr2,dr3;
      dr1 = dr2 = dr3 = ZEROF;

      r1 = rho_coeff[order-1][k];
      r2 = rho_coeff[order-1][k];
      r3 = rho_coeff[order-1][k];
      for (int l = order-2; l >= 0; l--) {
        r1 = rho_coeff[l][k] + r1 * dx;
        r2 = rho_coeff[l][k] + r2 * dy;
        r3 = rho_coeff[l][k] + r3 * dz;
	dr1 = drho_coeff[l][k] + dr1 * dx;
	dr2 = drho_coeff[l][k] + dr2 * dy;
	dr3 = drho_coeff[l][k] + dr3 * dz;
      }
      rho[0][k-nlower] = r1;
      rho[1][k-nlower] = r2;
      rho[2][k-nlower] = r3;
      drho[0][k-nlower] = dr1;
      drho[1][k-nlower] = dr2;
      drho[2][k-nlower] = dr3;
    }

    FFT_SCALAR ekx, eky, ekz;
    ekx = eky = ekz = ZEROF;
    for (int n = nlower; n <= nupper; n++) {
      int mz = n+nz;
      for (int m = nlower; m <= nupper; m++) {
        int my = m+ny;
        FFT_SCALAR ekx_p = rho[1][m-nlower] * rho[2][n-nlower];
        FFT_SCALAR eky_p = drho[1][m-nlower] * rho[2][n-nlower];
        FFT_SCALAR ekz_p = rho[1][m-nlower] * drho[2][n-nlower];
        for (int l = nlower; l <= nupper; l++) {
          int mx = l+nx;
          ekx += drho[0][l-nlower] * ekx_p * u_brick[mz][my][mx];
          eky += rho[0][l-nlower] * eky_p * u_brick[mz][my][mx];
          ekz += rho[0][l-nlower] * ekz_p * u_brick[mz][my][mx];
        }
      }
    }
    ekx *= hx_inv;
    eky *= hy_inv;
    ekz *= hz_inv;

    // convert E-field to force

    const flt_t qfactor = fqqrd2es * q[i];
    const flt_t twoqsq = (flt_t)2.0 * q[i] * q[i];

    const flt_t s1 = x[i].x * hx_inv;
    const flt_t s2 = x[i].y * hy_inv;
    const flt_t s3 = x[i].z * hz_inv;
    flt_t sf = fsf_coeff0 * sin(ftwo_pi * s1);
    sf += fsf_coeff1 * sin(ffour_pi * s1);
    sf *= twoqsq;
    f[i].x += qfactor * ekx - fqqrd2es * sf;

    sf = fsf_coeff2 * sin(ftwo_pi * s2);
    sf += fsf_coeff3 * sin(ffour_pi * s2);
    sf *= twoqsq;
    f[i].y += qfactor * eky - fqqrd2es * sf;

    sf = fsf_coeff4 * sin(ftwo_pi * s3);
    sf += fsf_coeff5 * sin(ffour_pi * s3);
    sf *= twoqsq;

    if (slabflag != 2) f[i].z += qfactor * ekz - fqqrd2es * sf;
  }
}

/* ----------------------------------------------------------------------
  Pack data into intel package buffers if using LRT mode
------------------------------------------------------------------------- */

void PPPMIntel::pack_buffers()
{
  fix->start_watch(TIME_PACK);
  #if defined(_OPENMP)
  #pragma omp parallel default(none)
  #endif
  {
    int ifrom, ito, tid;
    IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost,
                              comm->nthreads,
                              sizeof(IntelBuffers<float,double>::atom_t));
    if (fix->precision() == FixIntel::PREC_MODE_MIXED)
      fix->get_mixed_buffers()->thr_pack(ifrom,ito,1);
    else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
      fix->get_double_buffers()->thr_pack(ifrom,ito,1);
    else
      fix->get_single_buffers()->thr_pack(ifrom,ito,1);
  }
  fix->stop_watch(TIME_PACK);
}

/* ----------------------------------------------------------------------
   Returns 0 if Intel optimizations for PPPM ignored due to offload
------------------------------------------------------------------------- */

#ifdef _LMP_INTEL_OFFLOAD
int PPPMIntel::use_base() {
  return _use_base;
}
#endif