lammps/src/OPENMP/pppm_tip4p_omp.cpp

// clang-format off
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: Axel Kohlmeyer (Temple U)
------------------------------------------------------------------------- */

#include "pppm_tip4p_omp.h"

#include "atom.h"
#include "comm.h"
#include "domain.h"
#include "error.h"
#include "force.h"
#include "math_const.h"
#include "math_special.h"
#include "suffix.h"

#include <cstring>
#include <cmath>

#include "omp_compat.h"
#if defined(_OPENMP)
#include <omp.h>
#endif

using namespace LAMMPS_NS;
using namespace MathConst;
using namespace MathSpecial;

#ifdef FFT_SINGLE
#define ZEROF 0.0f
#else
#define ZEROF 0.0
#endif

#define EPS_HOC 1.0e-7
#define OFFSET 16384

/* ---------------------------------------------------------------------- */

PPPMTIP4POMP::PPPMTIP4POMP(LAMMPS *lmp) :
  PPPMTIP4P(lmp), ThrOMP(lmp, THR_KSPACE)
{
  triclinic_support = 1;
  suffix_flag |= Suffix::OMP;
}

/* ----------------------------------------------------------------------
   clean up per-thread allocations
------------------------------------------------------------------------- */

PPPMTIP4POMP::~PPPMTIP4POMP()
{
#if defined(_OPENMP)
#pragma omp parallel LMP_DEFAULT_NONE
#endif
  {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
#else
    const int tid = 0;
#endif
    ThrData *thr = fix->get_thr(tid);
    thr->init_pppm(-order,memory);
  }
}

/* ----------------------------------------------------------------------
   allocate memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */

void PPPMTIP4POMP::allocate()
{
  PPPMTIP4P::allocate();

#if defined(_OPENMP)
#pragma omp parallel LMP_DEFAULT_NONE
#endif
  {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
#else
    const int tid = 0;
#endif
    ThrData *thr = fix->get_thr(tid);
    thr->init_pppm(order,memory);
  }
}

/* ----------------------------------------------------------------------
   pre-compute modified (Hockney-Eastwood) Coulomb Green's function
------------------------------------------------------------------------- */

void PPPMTIP4POMP::compute_gf_ik()
{
  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;

  const double xprd = prd[0];
  const double yprd = prd[1];
  const double zprd = prd[2];
  const double zprd_slab = zprd*slab_volfactor;
  const double unitkx = (MY_2PI/xprd);
  const double unitky = (MY_2PI/yprd);
  const double unitkz = (MY_2PI/zprd_slab);

  const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
  const int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
  const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
  const int numk = nxhi_fft - nxlo_fft + 1;
  const int numl = nyhi_fft - nylo_fft + 1;

  const int twoorder = 2*order;

#if defined(_OPENMP)
#pragma omp parallel LMP_DEFAULT_NONE
#endif
  {
    double snx,sny,snz;
    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
    double sum1,dot1,dot2;
    double numerator,denominator;
    double sqk;

    int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid;

    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
    ThrData *thr = fix->get_thr(tid);
    thr->timer(Timer::START);

    for (n = nfrom; n < nto; ++n) {
      m = n / (numl*numk);
      l = (n - m*numl*numk) / numk;
      k = n - m*numl*numk - l*numk;
      m += nzlo_fft;
      l += nylo_fft;
      k += nxlo_fft;

      mper = m - nz_pppm*(2*m/nz_pppm);
      snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));

      lper = l - ny_pppm*(2*l/ny_pppm);
      sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));

      kper = k - nx_pppm*(2*k/nx_pppm);
      snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));

      sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);

      if (sqk != 0.0) {
        numerator = 12.5663706/sqk;
        denominator = gf_denom(snx,sny,snz);
        sum1 = 0.0;

        for (nx = -nbx; nx <= nbx; nx++) {
          qx = unitkx*(kper+nx_pppm*nx);
          sx = exp(-0.25*square(qx/g_ewald));
          argx = 0.5*qx*xprd/nx_pppm;
          wx = powsinxx(argx,twoorder);

          for (ny = -nby; ny <= nby; ny++) {
            qy = unitky*(lper+ny_pppm*ny);
            sy = exp(-0.25*square(qy/g_ewald));
            argy = 0.5*qy*yprd/ny_pppm;
            wy = powsinxx(argy,twoorder);

            for (nz = -nbz; nz <= nbz; nz++) {
              qz = unitkz*(mper+nz_pppm*nz);
              sz = exp(-0.25*square(qz/g_ewald));
              argz = 0.5*qz*zprd_slab/nz_pppm;
              wz = powsinxx(argz,twoorder);

              dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
              dot2 = qx*qx+qy*qy+qz*qz;
              sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
            }
          }
        }
        greensfn[n] = numerator*sum1/denominator;
      } else greensfn[n] = 0.0;
    }
    thr->timer(Timer::KSPACE);
  } // end of parallel region
}

/* ----------------------------------------------------------------------
   compute optimized Green's function for energy calculation
------------------------------------------------------------------------- */

void PPPMTIP4POMP::compute_gf_ad()
{

  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;

  const double xprd = prd[0];
  const double yprd = prd[1];
  const double zprd = prd[2];
  const double zprd_slab = zprd*slab_volfactor;
  const double unitkx = (MY_2PI/xprd);
  const double unitky = (MY_2PI/yprd);
  const double unitkz = (MY_2PI/zprd_slab);

  const int numk = nxhi_fft - nxlo_fft + 1;
  const int numl = nyhi_fft - nylo_fft + 1;

  const int twoorder = 2*order;
  double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;

#if defined(_OPENMP)
#pragma omp parallel LMP_DEFAULT_NONE reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
#endif
  {
    double snx,sny,snz,sqk;
    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
    double numerator,denominator;
    int k,l,m,kper,lper,mper,n,nfrom,nto,tid;

    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
    ThrData *thr = fix->get_thr(tid);
    thr->timer(Timer::START);

    for (n = nfrom; n < nto; ++n) {

      m = n / (numl*numk);
      l = (n - m*numl*numk) / numk;
      k = n - m*numl*numk - l*numk;
      m += nzlo_fft;
      l += nylo_fft;
      k += nxlo_fft;

      mper = m - nz_pppm*(2*m/nz_pppm);
      qz = unitkz*mper;
      snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
      sz = exp(-0.25*square(qz/g_ewald));
      argz = 0.5*qz*zprd_slab/nz_pppm;
      wz = powsinxx(argz,twoorder);

      lper = l - ny_pppm*(2*l/ny_pppm);
      qy = unitky*lper;
      sny = square(sin(0.5*qy*yprd/ny_pppm));
      sy = exp(-0.25*square(qy/g_ewald));
      argy = 0.5*qy*yprd/ny_pppm;
      wy = powsinxx(argy,twoorder);

      kper = k - nx_pppm*(2*k/nx_pppm);
      qx = unitkx*kper;
      snx = square(sin(0.5*qx*xprd/nx_pppm));
      sx = exp(-0.25*square(qx/g_ewald));
      argx = 0.5*qx*xprd/nx_pppm;
      wx = powsinxx(argx,twoorder);

      sqk = qx*qx + qy*qy + qz*qz;

      if (sqk != 0.0) {
        numerator = MY_4PI/sqk;
        denominator = gf_denom(snx,sny,snz);
        greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
        sf0 += sf_precoeff1[n]*greensfn[n];
        sf1 += sf_precoeff2[n]*greensfn[n];
        sf2 += sf_precoeff3[n]*greensfn[n];
        sf3 += sf_precoeff4[n]*greensfn[n];
        sf4 += sf_precoeff5[n]*greensfn[n];
        sf5 += sf_precoeff6[n]*greensfn[n];
      } else {
        greensfn[n] = 0.0;
        sf0 += sf_precoeff1[n]*greensfn[n];
        sf1 += sf_precoeff2[n]*greensfn[n];
        sf2 += sf_precoeff3[n]*greensfn[n];
        sf3 += sf_precoeff4[n]*greensfn[n];
        sf4 += sf_precoeff5[n]*greensfn[n];
        sf5 += sf_precoeff6[n]*greensfn[n];
      }
    }
    thr->timer(Timer::KSPACE);
  } // end of parallel region

  // compute the coefficients for the self-force correction

  double prex, prey, prez, tmp[6];
  prex = prey = prez = MY_PI/volume;
  prex *= nx_pppm/xprd;
  prey *= ny_pppm/yprd;
  prez *= nz_pppm/zprd_slab;
  tmp[0] = sf0 * prex;
  tmp[1] = sf1 * prex*2;
  tmp[2] = sf2 * prey;
  tmp[3] = sf3 * prey*2;
  tmp[4] = sf4 * prez;
  tmp[5] = sf5 * prez*2;

  // communicate values with other procs

  MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
}

/* ----------------------------------------------------------------------
   run the regular toplevel compute method from plain PPPM
   which will have individual methods replaced by our threaded
   versions and then call the obligatory force reduction.
------------------------------------------------------------------------- */

void PPPMTIP4POMP::compute(int eflag, int vflag)
{

  PPPMTIP4P::compute(eflag,vflag);

#if defined(_OPENMP)
#pragma omp parallel LMP_DEFAULT_NONE LMP_SHARED(eflag,vflag)
#endif
  {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
#else
    const int tid = 0;
#endif
    ThrData *thr = fix->get_thr(tid);
    thr->timer(Timer::START);
    reduce_thr(this, eflag, vflag, thr);
  } // end of omp parallel region
}

/* ----------------------------------------------------------------------
   find center grid pt for each of my particles
   check that full stencil for the particle will fit in my 3d brick
   store central grid pt indices in part2grid array
------------------------------------------------------------------------- */

void PPPMTIP4POMP::particle_map()
{
  // no local atoms => nothing to do

  if (atom->nlocal == 0) return;

  const int * _noalias const type = atom->type;
  const auto * _noalias const x = (dbl3_t *) atom->x[0];
  auto * _noalias const p2g = (int3_t *) part2grid[0];
  const double boxlox = boxlo[0];
  const double boxloy = boxlo[1];
  const double boxloz = boxlo[2];
  const int nlocal = atom->nlocal;

  if (!std::isfinite(boxlox) || !std::isfinite(boxloy) || !std::isfinite(boxloz))
    error->one(FLERR,"Non-numeric box dimensions - simulation unstable");

  int flag = 0;
#if defined(_OPENMP)
#pragma omp parallel for LMP_DEFAULT_NONE reduction(+:flag) schedule(static)
#endif
  for (int i = 0; i < nlocal; i++) {
    dbl3_t xM;
    int iH1,iH2;

    if (type[i] == typeO) {
      find_M_thr(i,iH1,iH2,xM);
    } else {
      xM = x[i];
    }

    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
    // current particle coord can be outside global and local box
    // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1

    const int nx = static_cast<int> ((xM.x-boxlox)*delxinv+shift) - OFFSET;
    const int ny = static_cast<int> ((xM.y-boxloy)*delyinv+shift) - OFFSET;
    const int nz = static_cast<int> ((xM.z-boxloz)*delzinv+shift) - OFFSET;

    p2g[i].a = nx;
    p2g[i].b = ny;
    p2g[i].t = nz;

    // check that entire stencil around nx,ny,nz will fit in my 3d brick

    if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
        ny+nlower < nylo_out || ny+nupper > nyhi_out ||
        nz+nlower < nzlo_out || nz+nupper > nzhi_out)
      flag++;
  }

  int flag_all;
  MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
  if (flag_all) error->all(FLERR,"Out of range atoms - cannot compute PPPM");
}

/* ----------------------------------------------------------------------
   create discretized "density" on section of global grid due to my particles
   density(x,y,z) = charge "density" at grid points of my 3d brick
   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
   in global grid
------------------------------------------------------------------------- */

void PPPMTIP4POMP::make_rho()
{

  // clear 3d density array

  FFT_SCALAR * _noalias const d = &(density_brick[nzlo_out][nylo_out][nxlo_out]);
  memset(d,0,ngrid*sizeof(FFT_SCALAR));

  // no local atoms => nothing else to do

  const int nlocal = atom->nlocal;
  if (nlocal == 0) return;

  const int ix = nxhi_out - nxlo_out + 1;
  const int iy = nyhi_out - nylo_out + 1;

#if defined(_OPENMP)
#pragma omp parallel LMP_DEFAULT_NONE
#endif
  {
    const double * _noalias const q = atom->q;
    const auto * _noalias const x = (dbl3_t *) atom->x[0];
    const auto * _noalias const p2g = (int3_t *) part2grid[0];
    const int * _noalias const type = atom->type;
    dbl3_t xM;

    const double boxlox = boxlo[0];
    const double boxloy = boxlo[1];
    const double boxloz = boxlo[2];

    // determine range of grid points handled by this thread
    int i,jfrom,jto,tid,iH1,iH2;
    loop_setup_thr(jfrom,jto,tid,ngrid,comm->nthreads);

    // get per thread data
    ThrData *thr = fix->get_thr(tid);
    thr->timer(Timer::START);
    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

    // loop over my charges, add their contribution to nearby grid points
    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
    // (dx,dy,dz) = distance to "lower left" grid pt

    // loop over all local atoms for all threads
    for (i = 0; i < nlocal; i++) {

      const int nx = p2g[i].a;
      const int ny = p2g[i].b;
      const int nz = p2g[i].t;

      // pre-screen whether this atom will ever come within
      // reach of the data segement this thread is updating.
      if ( ((nz+nlower-nzlo_out)*ix*iy >= jto)
           || ((nz+nupper-nzlo_out+1)*ix*iy < jfrom) ) continue;

      if (type[i] == typeO) {
        find_M_thr(i,iH1,iH2,xM);
      } else {
        xM = x[i];
      }
      const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
      const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
      const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;

      compute_rho1d_thr(r1d,dx,dy,dz);

      const FFT_SCALAR z0 = delvolinv * q[i];

      for (int n = nlower; n <= nupper; ++n) {
        const int jn = (nz+n-nzlo_out)*ix*iy;
        const FFT_SCALAR y0 = z0*r1d[2][n];

        for (int m = nlower; m <= nupper; ++m) {
          const int jm = jn+(ny+m-nylo_out)*ix;
          const FFT_SCALAR x0 = y0*r1d[1][m];

          for (int l = nlower; l <= nupper; ++l) {
            const int jl = jm+nx+l-nxlo_out;
            // make sure each thread only updates
            // "his" elements of the density grid
            if (jl >= jto) break;
            if (jl < jfrom) continue;

            d[jl] += x0*r1d[0][l];
          }
        }
      }
    }
    thr->timer(Timer::KSPACE);
  }
}

/* ----------------------------------------------------------------------
   interpolate from grid to get electric field & force on my particles for ik
------------------------------------------------------------------------- */

void PPPMTIP4POMP::fieldforce_ik()
{
  const int nthreads = comm->nthreads;
  const int nlocal = atom->nlocal;

  // no local atoms => nothing to do

  if (nlocal == 0) return;

  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // ek = 3 components of E-field on particle

  const auto * _noalias const x = (dbl3_t *) atom->x[0];
  const double * _noalias const q = atom->q;
  const auto * _noalias const p2g = (int3_t *) part2grid[0];
  const int * _noalias const type = atom->type;

  const double qqrd2e = force->qqrd2e;
  const double boxlox = boxlo[0];
  const double boxloy = boxlo[1];
  const double boxloz = boxlo[2];

#if defined(_OPENMP)
#pragma omp parallel LMP_DEFAULT_NONE
#endif
  {
    dbl3_t xM;
    FFT_SCALAR x0,y0,z0,ekx,eky,ekz;
    int i,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz;

    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);

    // get per thread data
    ThrData *thr = fix->get_thr(tid);
    thr->timer(Timer::START);
    auto * _noalias const f = (dbl3_t *) thr->get_f()[0];
    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

    for (i = ifrom; i < ito; ++i) {
      if (type[i] == typeO) {
        find_M_thr(i,iH1,iH2,xM);
      } else xM = x[i];

      const int nx = p2g[i].a;
      const int ny = p2g[i].b;
      const int nz = p2g[i].t;
      const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
      const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
      const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;

      compute_rho1d_thr(r1d,dx,dy,dz);

      ekx = eky = ekz = ZEROF;
      for (n = nlower; n <= nupper; n++) {
        mz = n+nz;
        z0 = r1d[2][n];
        for (m = nlower; m <= nupper; m++) {
          my = m+ny;
          y0 = z0*r1d[1][m];
          for (l = nlower; l <= nupper; l++) {
            mx = l+nx;
            x0 = y0*r1d[0][l];
            ekx -= x0*vdx_brick[mz][my][mx];
            eky -= x0*vdy_brick[mz][my][mx];
            ekz -= x0*vdz_brick[mz][my][mx];
          }
        }
      }

      // convert E-field to force

      const double qfactor = qqrd2e * scale * q[i];
      if (type[i] != typeO) {
        f[i].x += qfactor*ekx;
        f[i].y += qfactor*eky;
        if (slabflag != 2) f[i].z += qfactor*ekz;

      } else {
        const double fx = qfactor * ekx;
        const double fy = qfactor * eky;
        const double fz = qfactor * ekz;

        f[i].x += fx*(1 - alpha);
        f[i].y += fy*(1 - alpha);
        if (slabflag != 2) f[i].z += fz*(1 - alpha);

        f[iH1].x += 0.5*alpha*fx;
        f[iH1].y += 0.5*alpha*fy;
        if (slabflag != 2) f[iH1].z += 0.5*alpha*fz;

        f[iH2].x += 0.5*alpha*fx;
        f[iH2].y += 0.5*alpha*fy;
        if (slabflag != 2) f[iH2].z += 0.5*alpha*fz;
      }
    }
    thr->timer(Timer::KSPACE);
  } // end of parallel region
}

/* ----------------------------------------------------------------------
   interpolate from grid to get electric field & force on my particles for ad
------------------------------------------------------------------------- */

void PPPMTIP4POMP::fieldforce_ad()
{
  const int nthreads = comm->nthreads;
  const int nlocal = atom->nlocal;

  // no local atoms => nothing to do

  if (nlocal == 0) return;

  const double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda;
  const double hx_inv = nx_pppm/prd[0];
  const double hy_inv = ny_pppm/prd[1];
  const double hz_inv = nz_pppm/prd[2];

  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // ek = 3 components of E-field on particle

  const auto * _noalias const x = (dbl3_t *) atom->x[0];
  const double * _noalias const q = atom->q;
  const auto * _noalias const p2g = (int3_t *) part2grid[0];
  const int * _noalias const type = atom->type;

  const double qqrd2e = force->qqrd2e;
  const double boxlox = boxlo[0];
  const double boxloy = boxlo[1];
  const double boxloz = boxlo[2];

#if defined(_OPENMP)
#pragma omp parallel LMP_DEFAULT_NONE
#endif
  {
    double s1,s2,s3,sf;
    dbl3_t xM;
    FFT_SCALAR ekx,eky,ekz;
    int i,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz;

    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);

    // get per thread data
    ThrData *thr = fix->get_thr(tid);
    thr->timer(Timer::START);
    auto * _noalias const f = (dbl3_t *) thr->get_f()[0];
    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
    FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());

    for (i = ifrom; i < ito; ++i) {
      if (type[i] == typeO) {
        find_M_thr(i,iH1,iH2,xM);
      } else xM = x[i];

      const int nx = p2g[i].a;
      const int ny = p2g[i].b;
      const int nz = p2g[i].t;
      const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
      const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
      const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;

      compute_rho1d_thr(r1d,dx,dy,dz);
      compute_drho1d_thr(d1d,dx,dy,dz);

      ekx = eky = ekz = ZEROF;
      for (n = nlower; n <= nupper; n++) {
        mz = n+nz;
        for (m = nlower; m <= nupper; m++) {
          my = m+ny;
          for (l = nlower; l <= nupper; l++) {
            mx = l+nx;
            ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
            eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
            ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
          }
        }
      }
      ekx *= hx_inv;
      eky *= hy_inv;
      ekz *= hz_inv;

      // convert E-field to force and subtract self forces

      const double qi = q[i];
      const double qfactor = qqrd2e * scale * qi;

      s1 = xM.x*hx_inv;
      sf = sf_coeff[0]*sin(MY_2PI*s1);
      sf += sf_coeff[1]*sin(MY_4PI*s1);
      sf *= 2.0*qi;
      const double fx = qfactor*(ekx - sf);

      s2 = xM.y*hy_inv;
      sf = sf_coeff[2]*sin(MY_2PI*s2);
      sf += sf_coeff[3]*sin(MY_4PI*s2);
      sf *= 2.0*qi;
      const double fy = qfactor*(eky - sf);

      s3 = xM.z*hz_inv;
      sf = sf_coeff[4]*sin(MY_2PI*s3);
      sf += sf_coeff[5]*sin(MY_4PI*s3);
      sf *= 2.0*qi;
      const double fz = qfactor*(ekz - sf);

      if (type[i] != typeO) {
        f[i].x += fx;
        f[i].y += fy;
        if (slabflag != 2) f[i].z += fz;

      } else {
        f[i].x += fx*(1 - alpha);
        f[i].y += fy*(1 - alpha);
        if (slabflag != 2) f[i].z += fz*(1 - alpha);

        f[iH1].x += 0.5*alpha*fx;
        f[iH1].y += 0.5*alpha*fy;
        if (slabflag != 2) f[iH1].z += 0.5*alpha*fz;

        f[iH2].x += 0.5*alpha*fx;
        f[iH2].y += 0.5*alpha*fy;
        if (slabflag != 2) f[iH2].z += 0.5*alpha*fz;
      }
    }
    thr->timer(Timer::KSPACE);
  } // end of parallel region
}

/* ----------------------------------------------------------------------
  find 2 H atoms bonded to O atom i
  compute position xM of fictitious charge site for O atom
  also return local indices iH1,iH2 of H atoms
------------------------------------------------------------------------- */

void PPPMTIP4POMP::find_M_thr(int i, int &iH1, int &iH2, dbl3_t &xM)
{
  double **x = atom->x;

  iH1 = atom->map(atom->tag[i] + 1);
  iH2 = atom->map(atom->tag[i] + 2);

  if (iH1 == -1 || iH2 == -1) error->one(FLERR,"TIP4P hydrogen is missing");
  if (atom->type[iH1] != typeH || atom->type[iH2] != typeH)
    error->one(FLERR,"TIP4P hydrogen has incorrect atom type");

  if (triclinic) {

    // need to use custom code to find the closest image for triclinic,
    // since local atoms are in lambda coordinates, but ghosts are not.

    int *sametag = atom->sametag;
    double xo[3],xh1[3],xh2[3],xm[3];
    const int nlocal = atom->nlocal;

    for (int ii = 0; ii < 3; ++ii) {
      xo[ii] = x[i][ii];
      xh1[ii] = x[iH1][ii];
      xh2[ii] = x[iH2][ii];
    }

    if (i < nlocal) domain->lamda2x(x[i],xo);
    if (iH1 < nlocal) domain->lamda2x(x[iH1],xh1);
    if (iH2 < nlocal) domain->lamda2x(x[iH2],xh2);

    double delx = xo[0] - xh1[0];
    double dely = xo[1] - xh1[1];
    double delz = xo[2] - xh1[2];
    double rsqmin = delx*delx + dely*dely + delz*delz;
    double rsq;
    int closest = iH1;

    // no need to run lamda2x here -> ghost atoms
    while (sametag[iH1] >= 0) {
      iH1 = sametag[iH1];
      delx = xo[0] - x[iH1][0];
      dely = xo[1] - x[iH1][1];
      delz = xo[2] - x[iH1][2];
      rsq = delx*delx + dely*dely + delz*delz;
      if (rsq < rsqmin) {
        rsqmin = rsq;
        closest = iH1;
        xh1[0] = x[iH1][0];
        xh1[1] = x[iH1][1];
        xh1[2] = x[iH1][2];
      }
    }
    iH1 = closest;

    closest = iH2;
    delx = xo[0] - xh2[0];
    dely = xo[1] - xh2[1];
    delz = xo[2] - xh2[2];
    rsqmin = delx*delx + dely*dely + delz*delz;

    while (sametag[iH2] >= 0) {
      iH2 = sametag[iH2];
      delx = xo[0] - x[iH2][0];
      dely = xo[1] - x[iH2][1];
      delz = xo[2] - x[iH2][2];
      rsq = delx*delx + dely*dely + delz*delz;
      if (rsq < rsqmin) {
        rsqmin = rsq;
        closest = iH2;
        xh2[0] = x[iH2][0];
        xh2[1] = x[iH2][1];
        xh2[2] = x[iH2][2];
      }
    }
    iH2 = closest;

    // finally compute M in real coordinates ...

    double delx1 = xh1[0] - xo[0];
    double dely1 = xh1[1] - xo[1];
    double delz1 = xh1[2] - xo[2];

    double delx2 = xh2[0] - xo[0];
    double dely2 = xh2[1] - xo[1];
    double delz2 = xh2[2] - xo[2];

    xm[0] = xo[0] + alpha * 0.5 * (delx1 + delx2);
    xm[1] = xo[1] + alpha * 0.5 * (dely1 + dely2);
    xm[2] = xo[2] + alpha * 0.5 * (delz1 + delz2);

    // ... and convert M to lamda space for PPPM

    domain->x2lamda(xm,(double *)&xM);

  } else {

    // set iH1,iH2 to index of closest image to O

    iH1 = domain->closest_image(i,iH1);
    iH2 = domain->closest_image(i,iH2);

    const auto * _noalias const x = (dbl3_t *) atom->x[0];

    double delx1 = x[iH1].x - x[i].x;
    double dely1 = x[iH1].y - x[i].y;
    double delz1 = x[iH1].z - x[i].z;

    double delx2 = x[iH2].x - x[i].x;
    double dely2 = x[iH2].y - x[i].y;
    double delz2 = x[iH2].z - x[i].z;

    xM.x = x[i].x + alpha * 0.5 * (delx1 + delx2);
    xM.y = x[i].y + alpha * 0.5 * (dely1 + dely2);
    xM.z = x[i].z + alpha * 0.5 * (delz1 + delz2);
  }
}


/* ----------------------------------------------------------------------
   charge assignment into rho1d
   dx,dy,dz = distance of particle from "lower left" grid point
------------------------------------------------------------------------- */
void PPPMTIP4POMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx,
                                const FFT_SCALAR &dy, const FFT_SCALAR &dz)
{
  int k,l;
  FFT_SCALAR r1,r2,r3;

  for (k = (1-order)/2; k <= order/2; k++) {
    r1 = r2 = r3 = ZEROF;

    for (l = order-1; l >= 0; l--) {
      r1 = rho_coeff[l][k] + r1*dx;
      r2 = rho_coeff[l][k] + r2*dy;
      r3 = rho_coeff[l][k] + r3*dz;
    }
    r1d[0][k] = r1;
    r1d[1][k] = r2;
    r1d[2][k] = r3;
  }
}

/* ----------------------------------------------------------------------
   charge assignment into drho1d
   dx,dy,dz = distance of particle from "lower left" grid point
------------------------------------------------------------------------- */

void PPPMTIP4POMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx,
                              const FFT_SCALAR &dy, const FFT_SCALAR &dz)
{
  int k,l;
  FFT_SCALAR r1,r2,r3;

  for (k = (1-order)/2; k <= order/2; k++) {
    r1 = r2 = r3 = ZEROF;

    for (l = order-2; l >= 0; l--) {
      r1 = drho_coeff[l][k] + r1*dx;
      r2 = drho_coeff[l][k] + r2*dy;
      r3 = drho_coeff[l][k] + r3*dz;
    }
    d1d[0][k] = r1;
    d1d[1][k] = r2;
    d1d[2][k] = r3;
  }
}