/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation. Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software. This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: Axel Kohlmeyer (Temple U)
------------------------------------------------------------------------- */
#include "pppm_omp.h"
|
|
#include "atom.h"
|
|
#include "comm.h"
|
|
#include "domain.h"
|
|
#include "force.h"
|
|
#include "memory.h"
|
|
#include "math_const.h"
|
|
|
|
#include <string.h>
|
|
#include <math.h>
|
|
|
|
#include "suffix.h"
|
|
using namespace LAMMPS_NS;
|
|
using namespace MathConst;
|
|
|
|
#ifdef FFT_SINGLE
|
|
#define ZEROF 0.0f
|
|
#define ONEF 1.0f
|
|
#else
|
|
#define ZEROF 0.0
|
|
#define ONEF 1.0
|
|
#endif
|
|
|
|
#define EPS_HOC 1.0e-7
|
|
|
|
/* ---------------------------------------------------------------------- */

PPPMOMP::PPPMOMP(LAMMPS *lmp, int narg, char **arg) :
  PPPM(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE)
{
  suffix_flag |= Suffix::OMP;
}

/* ----------------------------------------------------------------------
   allocate memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */

void PPPMOMP::allocate()
{
  PPPM::allocate();

  const int nthreads = comm->nthreads;

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
#else
    const int tid = 0;
#endif

    FFT_SCALAR **rho1d_thr;
    memory->create2d_offset(rho1d_thr,3,-order/2,order/2,"pppm:rho1d_thr");
    ThrData *thr = fix->get_thr(tid);
    thr->init_pppm(static_cast<void *>(rho1d_thr));
  }

  const int nzend = (nzhi_out-nzlo_out+1)*nthreads + nzlo_out - 1;

  // reallocate density brick, so it fits our needs
  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
  memory->create3d_offset(density_brick,nzlo_out,nzend,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm:density_brick");
}
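
// Layout note: the density brick allocated above is enlarged along z so
// that each of the nthreads threads owns a private copy of the
// (nzlo_out:nzhi_out) slab, stacked back-to-back:
//
//   thread t's slab starts at z index  nzlo_out + t*(nzhi_out-nzlo_out+1)
//
// e.g. with nzlo_out = -2, nzhi_out = 12 (15 z-planes) and 4 threads, the
// brick spans z = -2 ... 57 and thread 2 writes to planes 28 ... 42.
// make_rho() below selects its slab with exactly this offset arithmetic.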

// NOTE: special version of the per-thread data reduction for the
// FFT_SCALAR data type. it reduces the per-thread copies into the
// first part of the array, which is used by the non-threaded parts
// of the code, and resets the temporary storage to 0.0. this routine
// assumes that multi-dimensional arrays like forces are stored in
// the order x1,y1,z1,x2,y2,z2,...
// we need to post a barrier first to wait until all threads are done
// writing to the array.
static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid)
{
#if defined(_OPENMP)
  // NOOP in single-threaded execution
  if (nthreads == 1) return;
#pragma omp barrier
  {
    const int nvals = ndim*nall;
    const int idelta = nvals/nthreads + 1;
    const int ifrom = tid*idelta;
    const int ito = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);

    // this if protects against having more threads than data values.
    // the bound must be nvals, not nall, or chunks between nall and
    // nvals would be skipped when ndim > 1.
    if (ifrom < nvals) {
      for (int m = ifrom; m < ito; ++m) {
        for (int n = 1; n < nthreads; ++n) {
          dall[m] += dall[n*nvals + m];
          dall[n*nvals + m] = 0.0;
        }
      }
    }
  }
#else
  // NOOP in non-threaded compilation
  return;
#endif
}
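
// Usage sketch: every thread in the parallel region passes the SAME base
// pointer together with its own tid; the routine then cooperatively sums
// the nthreads stacked copies into copy 0 and zeroes the others. For the
// per-thread density bricks above the call (made by all threads) is:
//
//   data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),
//                   ngrid, nthreads, 1, tid);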

/* ----------------------------------------------------------------------
   free memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */

void PPPMOMP::deallocate()
{
  PPPM::deallocate();
  for (int i=0; i < comm->nthreads; ++i) {
    ThrData *thr = fix->get_thr(i);
    FFT_SCALAR **rho1d_thr = static_cast<FFT_SCALAR **>(thr->get_rho1d());
    memory->destroy2d_offset(rho1d_thr,-order/2);
  }
}

/* ----------------------------------------------------------------------
   adjust PPPM coeffs, called initially and whenever volume has changed
------------------------------------------------------------------------- */

void PPPMOMP::setup()
{
  int i,j,k,n;
  double *prd;

  // volume-dependent factors
  // adjust z dimension for 2d slab PPPM
  // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0

  if (triclinic == 0) prd = domain->prd;
  else prd = domain->prd_lamda;

  const double xprd = prd[0];
  const double yprd = prd[1];
  const double zprd = prd[2];
  const double zprd_slab = zprd*slab_volfactor;
  volume = xprd * yprd * zprd_slab;

  delxinv = nx_pppm/xprd;
  delyinv = ny_pppm/yprd;
  delzinv = nz_pppm/zprd_slab;

  delvolinv = delxinv*delyinv*delzinv;

  const double unitkx = (2.0*MY_PI/xprd);
  const double unitky = (2.0*MY_PI/yprd);
  const double unitkz = (2.0*MY_PI/zprd_slab);

  // fkx,fky,fkz for my FFT grid pts
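  // the folding  per = i - N*(2*i/N)  (integer division) maps the FFT
  // index i = 0,...,N-1 onto the signed mode number: per = i for
  // i < N/2 and per = i - N otherwise, e.g. for N = 8:
  //   i   = 0, 1, 2, 3,  4,  5,  6,  7
  //   per = 0, 1, 2, 3, -4, -3, -2, -1
  // so fkx/fky/fkz hold the physical wavevector components 2*pi*per/L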

  double per;

  for (i = nxlo_fft; i <= nxhi_fft; i++) {
    per = i - nx_pppm*(2*i/nx_pppm);
    fkx[i] = unitkx*per;
  }

  for (i = nylo_fft; i <= nyhi_fft; i++) {
    per = i - ny_pppm*(2*i/ny_pppm);
    fky[i] = unitky*per;
  }

  for (i = nzlo_fft; i <= nzhi_fft; i++) {
    per = i - nz_pppm*(2*i/nz_pppm);
    fkz[i] = unitkz*per;
  }

  // virial coefficients
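  // with vterm = -2*(1/k^2 + 1/(4*g_ewald^2)) the six vg[n][*] entries
  // tabulate the reciprocal-space virial tensor prefactors
  //   v_ab = (delta_ab + vterm*k_a*k_b) * E_k
  // for each mode k, stored in the order xx, yy, zz, xy, xz, yz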

  double sqk,vterm;

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++) {
    for (j = nylo_fft; j <= nyhi_fft; j++) {
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k];
        if (sqk == 0.0) {
          vg[n][0] = 0.0;
          vg[n][1] = 0.0;
          vg[n][2] = 0.0;
          vg[n][3] = 0.0;
          vg[n][4] = 0.0;
          vg[n][5] = 0.0;
        } else {
          vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
          vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i];
          vg[n][1] = 1.0 + vterm*fky[j]*fky[j];
          vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k];
          vg[n][3] = vterm*fkx[i]*fky[j];
          vg[n][4] = vterm*fkx[i]*fkz[k];
          vg[n][5] = vterm*fky[j]*fkz[k];
        }
        n++;
      }
    }
  }

  // modified (Hockney-Eastwood) Coulomb Green's function

  const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
  const int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
  const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
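
  // nbx,nby,nbz = number of aliased images of the grid summed over in
  // each direction when tabulating the Green's function below; the
  // pow(-log(EPS_HOC),0.25) factor truncates the sums roughly where the
  // Gaussian weight of further images drops below EPS_HOC = 1e-7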

  const double form = 1.0;

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
    int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m;
    double snx,sny,snz,sqk;
    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
    double sum1,dot1,dot2;
    double numerator,denominator;
    const double gew2 = -4.0*g_ewald*g_ewald;

    const int nnx = nxhi_fft-nxlo_fft+1;
    const int nny = nyhi_fft-nylo_fft+1;

    loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads);

    for (m = nzlo_fft; m <= nzhi_fft; m++) {

      const double fkzm = fkz[m];
      snz = sin(0.5*fkzm*zprd_slab/nz_pppm);
      snz *= snz;

      for (l = nylo_fft; l <= nyhi_fft; l++) {
        const double fkyl = fky[l];
        sny = sin(0.5*fkyl*yprd/ny_pppm);
        sny *= sny;

        for (k = nxlo_fft; k <= nxhi_fft; k++) {

          /* only compute the part designated to this thread */
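          // nn linearizes (k,l,m) in x-fastest order to match greensfn[]:
          //   nn = (k-nxlo_fft) + nnx*((l-nylo_fft) + nny*(m-nzlo_fft))
          // loop_setup_thr() above assigned this thread the contiguous
          // slice [nnfrom,nnto) of the nfft grid points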
          nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft));
          if ((nn < nnfrom) || (nn >= nnto)) continue;

          const double fkxk = fkx[k];
          snx = sin(0.5*fkxk*xprd/nx_pppm);
          snx *= snx;

          sqk = fkxk*fkxk + fkyl*fkyl + fkzm*fkzm;

          if (sqk != 0.0) {
            numerator = form*MY_4PI/sqk;
            denominator = gf_denom(snx,sny,snz);
            sum1 = 0.0;
            for (nx = -nbx; nx <= nbx; nx++) {
              qx = fkxk + unitkx*nx_pppm*nx;
              sx = exp(qx*qx/gew2);
              wx = 1.0;
              argx = 0.5*qx*xprd/nx_pppm;
              if (argx != 0.0) wx = pow(sin(argx)/argx,order);
              wx *= wx;

              for (ny = -nby; ny <= nby; ny++) {
                qy = fkyl + unitky*ny_pppm*ny;
                sy = exp(qy*qy/gew2);
                wy = 1.0;
                argy = 0.5*qy*yprd/ny_pppm;
                if (argy != 0.0) wy = pow(sin(argy)/argy,order);
                wy *= wy;

                for (nz = -nbz; nz <= nbz; nz++) {
                  qz = fkzm + unitkz*nz_pppm*nz;
                  sz = exp(qz*qz/gew2);
                  wz = 1.0;
                  argz = 0.5*qz*zprd_slab/nz_pppm;
                  if (argz != 0.0) wz = pow(sin(argz)/argz,order);
                  wz *= wz;

                  dot1 = fkxk*qx + fkyl*qy + fkzm*qz;
                  dot2 = qx*qx+qy*qy+qz*qz;
                  sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
                }
              }
            }
            greensfn[nn] = numerator*sum1/denominator;
          } else greensfn[nn] = 0.0;
        }
      }
    }
  }
}

/* ----------------------------------------------------------------------
   run the regular top-level compute method from plain PPPM, which
   dispatches to the threaded make_rho() and fieldforce() overrides
   below, then perform the obligatory per-thread force reduction
------------------------------------------------------------------------- */

void PPPMOMP::compute(int eflag, int vflag)
{
  PPPM::compute(eflag,vflag);

#if defined(_OPENMP)
#pragma omp parallel default(none) shared(eflag,vflag)
#endif
  {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
#else
    const int tid = 0;
#endif
    ThrData *thr = fix->get_thr(tid);
    reduce_thr(this, eflag, vflag, thr);
  } // end of omp parallel region
}

/* ----------------------------------------------------------------------
   create discretized "density" on section of global grid due to my particles
   density(x,y,z) = charge "density" at grid points of my 3d brick
   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
   in global grid
------------------------------------------------------------------------- */

void PPPMOMP::make_rho()
{
  const double * const q = atom->q;
  const double * const * const x = atom->x;
  const int nthreads = comm->nthreads;
  const int nlocal = atom->nlocal;

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
#if defined(_OPENMP)
    // each thread works on a fixed chunk of atoms.
    const int tid = omp_get_thread_num();
    const int inum = nlocal;
    const int idelta = 1 + inum/nthreads;
    const int ifrom = tid*idelta;
    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
#else
    const int tid = 0;
    const int ifrom = 0;
    const int ito = nlocal;
#endif

    int l,m,n,nx,ny,nz,mx,my,mz;
    FFT_SCALAR dx,dy,dz,x0,y0,z0;

    // set up and clear this thread's copy of the 3d density array
    const int nzoffs = (nzhi_out-nzlo_out+1)*tid;
    FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]);
    memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR));

    ThrData *thr = fix->get_thr(tid);
    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

    // loop over my charges, add their contribution to nearby grid points
    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
    // (dx,dy,dz) = distance to "lower left" grid pt
    // (mx,my,mz) = global coords of moving stencil pt

    // this if protects against having more threads than local atoms
    if (ifrom < nlocal) {
      for (int i = ifrom; i < ito; i++) {

        nx = part2grid[i][0];
        ny = part2grid[i][1];
        nz = part2grid[i][2];
        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

        compute_rho1d_thr(r1d,dx,dy,dz);

        z0 = delvolinv * q[i];
        for (n = nlower; n <= nupper; n++) {
          mz = n+nz;
          y0 = z0*r1d[2][n];
          for (m = nlower; m <= nupper; m++) {
            my = m+ny;
            x0 = y0*r1d[1][m];
            for (l = nlower; l <= nupper; l++) {
              mx = l+nx;
              db[mz][my][mx] += x0*r1d[0][l];
            }
          }
        }
      }
    }
#if defined(_OPENMP)
    // reduce the per-thread density arrays into the first copy
    if (nthreads > 1) {
      data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
    }
#endif
  }
}
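
// Design note: the charge scatter in make_rho() needs no atomics or
// critical sections because each thread writes only to its private copy
// of the density brick (selected via the nzoffs z-plane offset set up in
// allocate()); atoms are split into contiguous chunks of 1 + nlocal/nthreads
// per thread, and the per-thread copies are summed back into copy 0 by
// data_reduce_fft().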

/* ----------------------------------------------------------------------
   interpolate from grid to get electric field & force on my particles
------------------------------------------------------------------------- */

void PPPMOMP::fieldforce()
{
  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // ek = 3 components of E-field on particle

  const double * const q = atom->q;
  const double * const * const x = atom->x;
  const int nthreads = comm->nthreads;
  const int nlocal = atom->nlocal;
  const double qqrd2e = force->qqrd2e;

#if defined(_OPENMP)
#pragma omp parallel default(none)
  {
    // each thread works on a fixed chunk of atoms.
    const int tid = omp_get_thread_num();
    const int inum = nlocal;
    const int idelta = 1 + inum/nthreads;
    const int ifrom = tid*idelta;
    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
#else
    const int ifrom = 0;
    const int ito = nlocal;
    const int tid = 0;
#endif
    ThrData *thr = fix->get_thr(tid);
    double * const * const f = thr->get_f();
    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

    int l,m,n,nx,ny,nz,mx,my,mz;
    FFT_SCALAR dx,dy,dz,x0,y0,z0;
    FFT_SCALAR ekx,eky,ekz;

    // this if protects against having more threads than local atoms
    if (ifrom < nlocal) {
      for (int i = ifrom; i < ito; i++) {

        nx = part2grid[i][0];
        ny = part2grid[i][1];
        nz = part2grid[i][2];
        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

        compute_rho1d_thr(r1d,dx,dy,dz);

        ekx = eky = ekz = ZEROF;
        for (n = nlower; n <= nupper; n++) {
          mz = n+nz;
          z0 = r1d[2][n];
          for (m = nlower; m <= nupper; m++) {
            my = m+ny;
            y0 = z0*r1d[1][m];
            for (l = nlower; l <= nupper; l++) {
              mx = l+nx;
              x0 = y0*r1d[0][l];
              ekx -= x0*vdx_brick[mz][my][mx];
              eky -= x0*vdy_brick[mz][my][mx];
              ekz -= x0*vdz_brick[mz][my][mx];
            }
          }
        }

        // convert E-field to force
        const double qfactor = qqrd2e*scale*q[i];
        f[i][0] += qfactor*ekx;
        f[i][1] += qfactor*eky;
        f[i][2] += qfactor*ekz;
      }
    }
#if defined(_OPENMP)
  }
#endif
}
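
// Note: vdx/vdy/vdz_brick hold the grid electric field computed by the
// base class PPPM; each thread interpolates it back to its own chunk of
// atoms and accumulates forces into its private per-thread force array
// (thr->get_f()), which reduce_thr() in compute() later sums into the
// global force array.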

/* ----------------------------------------------------------------------
   charge assignment into rho1d
   dx,dy,dz = distance of particle from "lower left" grid point
------------------------------------------------------------------------- */

void PPPMOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx,
                                const FFT_SCALAR &dy, const FFT_SCALAR &dz)
{
  int k,l;
  FFT_SCALAR r1,r2,r3;

  for (k = (1-order)/2; k <= order/2; k++) {
    r1 = r2 = r3 = ZEROF;

    for (l = order-1; l >= 0; l--) {
      r1 = rho_coeff[l][k] + r1*dx;
      r2 = rho_coeff[l][k] + r2*dy;
      r3 = rho_coeff[l][k] + r3*dz;
    }
    r1d[0][k] = r1;
    r1d[1][k] = r2;
    r1d[2][k] = r3;
  }
}
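
// Note: the inner l-loop above is Horner's rule: for each stencil point k
// it evaluates the charge-assignment polynomial
//   rho1d[k](d) = sum_{l=0}^{order-1} rho_coeff[l][k] * d^l
// at d = dx, dy, dz in one pass, e.g. for order = 3:
//   r1 = (rho_coeff[2][k]*dx + rho_coeff[1][k])*dx + rho_coeff[0][k]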