lammps/src/USER-OMP/pppm_cg_omp.cpp

/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: Axel Kohlmeyer (Temple U)
------------------------------------------------------------------------- */

#include "pppm_cg_omp.h"
#include "atom.h"
#include "comm.h"
#include "domain.h"
#include "error.h"
#include "fix_omp.h"
#include "force.h"
#include "memory.h"
#include "math_const.h"
#include "math_special.h"

#include <string.h>
#include <math.h>

#include "suffix.h"
using namespace LAMMPS_NS;
using namespace MathConst;
using namespace MathSpecial;

#ifdef FFT_SINGLE
#define ZEROF 0.0f
#define ONEF  1.0f
#else
#define ZEROF 0.0
#define ONEF  1.0
#endif

#define EPS_HOC 1.0e-7

/* ---------------------------------------------------------------------- */

PPPMCGOMP::PPPMCGOMP(LAMMPS *lmp, int narg, char **arg) :
  PPPMCG(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE)
{
  triclinic_support = 0;
  suffix_flag |= Suffix::OMP;
}

/* ----------------------------------------------------------------------
   allocate memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */

void PPPMCGOMP::allocate()
{
  PPPMCG::allocate();

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
#else
    const int tid = 0;
#endif
    ThrData *thr = fix->get_thr(tid);
    thr->init_pppm(order,memory);
  }
}

/* ----------------------------------------------------------------------
   free memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */

void PPPMCGOMP::deallocate()
{
  PPPMCG::deallocate();

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
#else
    const int tid = 0;
#endif
    ThrData *thr = fix->get_thr(tid);
    thr->init_pppm(-order,memory);
  }
}

/* ----------------------------------------------------------------------
   pre-compute modified (Hockney-Eastwood) Coulomb Green's function
------------------------------------------------------------------------- */

void PPPMCGOMP::compute_gf_ik()
{
  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;

  const double xprd = prd[0];
  const double yprd = prd[1];
  const double zprd = prd[2];
  const double zprd_slab = zprd*slab_volfactor;
  const double unitkx = (MY_2PI/xprd);
  const double unitky = (MY_2PI/yprd);
  const double unitkz = (MY_2PI/zprd_slab);

  const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
  const int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
  const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
  const int numk = nxhi_fft - nxlo_fft + 1;
  const int numl = nyhi_fft - nylo_fft + 1;

  const int twoorder = 2*order;

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
    double snx,sny,snz;
    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
    double sum1,dot1,dot2;
    double numerator,denominator;
    double sqk;

    int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid;

    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);

    for (n = nfrom; n < nto; ++n) {
      m = n / (numl*numk);
      l = (n - m*numl*numk) / numk;
      k = n - m*numl*numk - l*numk;
      m += nzlo_fft;
      l += nylo_fft;
      k += nxlo_fft;

      mper = m - nz_pppm*(2*m/nz_pppm);
      snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));

      lper = l - ny_pppm*(2*l/ny_pppm);
      sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));

      kper = k - nx_pppm*(2*k/nx_pppm);
      snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));

      sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);

      if (sqk != 0.0) {
	numerator = 12.5663706/sqk;
	denominator = gf_denom(snx,sny,snz);
	sum1 = 0.0;

	for (nx = -nbx; nx <= nbx; nx++) {
	  qx = unitkx*(kper+nx_pppm*nx);
	  sx = exp(-0.25*square(qx/g_ewald));
	  argx = 0.5*qx*xprd/nx_pppm;
	  wx = powsinxx(argx,twoorder);

	  for (ny = -nby; ny <= nby; ny++) {
	    qy = unitky*(lper+ny_pppm*ny);
	    sy = exp(-0.25*square(qy/g_ewald));
	    argy = 0.5*qy*yprd/ny_pppm;
	    wy = powsinxx(argy,twoorder);

	    for (nz = -nbz; nz <= nbz; nz++) {
	      qz = unitkz*(mper+nz_pppm*nz);
	      sz = exp(-0.25*square(qz/g_ewald));
	      argz = 0.5*qz*zprd_slab/nz_pppm;
	      wz = powsinxx(argz,twoorder);

	      dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
	      dot2 = qx*qx+qy*qy+qz*qz;
	      sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
	    }
	  }
	}
	greensfn[n] = numerator*sum1/denominator;
      } else greensfn[n] = 0.0;
    }
  } // end of parallel region
}

/* ----------------------------------------------------------------------
   compute optimized Green's function for energy calculation
------------------------------------------------------------------------- */

void PPPMCGOMP::compute_gf_ad()
{

  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;

  const double xprd = prd[0];
  const double yprd = prd[1];
  const double zprd = prd[2];
  const double zprd_slab = zprd*slab_volfactor;
  const double unitkx = (MY_2PI/xprd);
  const double unitky = (MY_2PI/yprd);
  const double unitkz = (MY_2PI/zprd_slab);

  const int numk = nxhi_fft - nxlo_fft + 1;
  const int numl = nyhi_fft - nylo_fft + 1;

  const int twoorder = 2*order;
  double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;

#if defined(_OPENMP)
#pragma omp parallel default(none) reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
#endif
  {
    double snx,sny,snz,sqk;
    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
    double numerator,denominator;
    int k,l,m,kper,lper,mper,n,nfrom,nto,tid;

    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);

    for (n = nfrom; n < nto; ++n) {

      m = n / (numl*numk);
      l = (n - m*numl*numk) / numk;
      k = n - m*numl*numk - l*numk;
      m += nzlo_fft;
      l += nylo_fft;
      k += nxlo_fft;

      mper = m - nz_pppm*(2*m/nz_pppm);
      qz = unitkz*mper;
      snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
      sz = exp(-0.25*square(qz/g_ewald));
      argz = 0.5*qz*zprd_slab/nz_pppm;
      wz = powsinxx(argz,twoorder);

      lper = l - ny_pppm*(2*l/ny_pppm);
      qy = unitky*lper;
      sny = square(sin(0.5*qy*yprd/ny_pppm));
      sy = exp(-0.25*square(qy/g_ewald));
      argy = 0.5*qy*yprd/ny_pppm;
      wy = powsinxx(argy,twoorder);

      kper = k - nx_pppm*(2*k/nx_pppm);
      qx = unitkx*kper;
      snx = square(sin(0.5*qx*xprd/nx_pppm));
      sx = exp(-0.25*square(qx/g_ewald));
      argx = 0.5*qx*xprd/nx_pppm;
      wx = powsinxx(argx,twoorder);

      sqk = qx*qx + qy*qy + qz*qz;

      if (sqk != 0.0) {
	numerator = MY_4PI/sqk;
	denominator = gf_denom(snx,sny,snz);
	greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
	sf0 += sf_precoeff1[n]*greensfn[n];
	sf1 += sf_precoeff2[n]*greensfn[n];
	sf2 += sf_precoeff3[n]*greensfn[n];
	sf3 += sf_precoeff4[n]*greensfn[n];
	sf4 += sf_precoeff5[n]*greensfn[n];
	sf5 += sf_precoeff6[n]*greensfn[n];
      } else {
	greensfn[n] = 0.0;
	sf0 += sf_precoeff1[n]*greensfn[n];
	sf1 += sf_precoeff2[n]*greensfn[n];
	sf2 += sf_precoeff3[n]*greensfn[n];
	sf3 += sf_precoeff4[n]*greensfn[n];
	sf4 += sf_precoeff5[n]*greensfn[n];
	sf5 += sf_precoeff6[n]*greensfn[n];
      }
    }
  } // end of paralle region

  // compute the coefficients for the self-force correction

  double prex, prey, prez, tmp[6];
  prex = prey = prez = MY_PI/volume;
  prex *= nx_pppm/xprd;
  prey *= ny_pppm/yprd;
  prez *= nz_pppm/zprd_slab;
  tmp[0] = sf0 * prex;
  tmp[1] = sf1 * prex*2;
  tmp[2] = sf2 * prey;
  tmp[3] = sf3 * prey*2;
  tmp[4] = sf4 * prez;
  tmp[5] = sf5 * prez*2;

  // communicate values with other procs

  MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
}

/* ----------------------------------------------------------------------
   run the regular toplevel compute method from plain PPPMCG
   which will have individual methods replaced by our threaded
   versions and then call the obligatory force reduction.
------------------------------------------------------------------------- */

void PPPMCGOMP::compute(int eflag, int vflag)
{

  PPPMCG::compute(eflag,vflag);

#if defined(_OPENMP)
#pragma omp parallel default(none) shared(eflag,vflag)
#endif
  {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
#else
    const int tid = 0;
#endif
    ThrData *thr = fix->get_thr(tid);
    reduce_thr(this, eflag, vflag, thr);
  } // end of omp parallel region
}

/* ----------------------------------------------------------------------
   create discretized "density" on section of global grid due to my particles
   density(x,y,z) = charge "density" at grid points of my 3d brick
   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
   in global grid
------------------------------------------------------------------------- */

void PPPMCGOMP::make_rho()
{

  // clear 3d density array

  FFT_SCALAR * _noalias const d = &(density_brick[nzlo_out][nylo_out][nxlo_out]);
  memset(d,0,ngrid*sizeof(FFT_SCALAR));

  // no local atoms with a charge => nothing else to do

  if (num_charged == 0) return;

  const int ix = nxhi_out - nxlo_out + 1;
  const int iy = nyhi_out - nylo_out + 1;

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
    const double * _noalias const q = atom->q;
    const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
    const int3_t * _noalias const p2g = (int3_t *) part2grid[0];

    const double boxlox = boxlo[0];
    const double boxloy = boxlo[1];
    const double boxloz = boxlo[2];

    // determine range of grid points handled by this thread
    int i,jfrom,jto,tid;
    loop_setup_thr(jfrom,jto,tid,ngrid,comm->nthreads);

    // get per thread data
    ThrData *thr = fix->get_thr(tid);
    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

    // loop over my charges, add their contribution to nearby grid points
    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
    // (dx,dy,dz) = distance to "lower left" grid pt

    // loop over all local atoms for all threads

    for (int j = 0; j < num_charged; j++) {
      i = is_charged[j];

      const int nx = p2g[i].a;
      const int ny = p2g[i].b;
      const int nz = p2g[i].t;

      // pre-screen whether this atom will ever come within
      // reach of the data segement this thread is updating.
      if ( ((nz+nlower-nzlo_out)*ix*iy >= jto)
           || ((nz+nupper-nzlo_out+1)*ix*iy < jfrom) ) continue;

      const FFT_SCALAR dx = nx+shiftone - (x[i].x-boxlox)*delxinv;
      const FFT_SCALAR dy = ny+shiftone - (x[i].y-boxloy)*delyinv;
      const FFT_SCALAR dz = nz+shiftone - (x[i].z-boxloz)*delzinv;

      compute_rho1d_thr(r1d,dx,dy,dz);

      const FFT_SCALAR z0 = delvolinv * q[i];

      for (int n = nlower; n <= nupper; ++n) {
        const int jn = (nz+n-nzlo_out)*ix*iy;
        const FFT_SCALAR y0 = z0*r1d[2][n];

        for (int m = nlower; m <= nupper; ++m) {
          const int jm = jn+(ny+m-nylo_out)*ix;
          const FFT_SCALAR x0 = y0*r1d[1][m];

          for (int l = nlower; l <= nupper; ++l) {
            const int jl = jm+nx+l-nxlo_out;
            // make sure each thread only updates
            // "his" elements of the density grid
            if (jl >= jto) break;
            if (jl < jfrom) continue;

            d[jl] += x0*r1d[0][l];
          }
        }
      }
    }
  }
}

/* ----------------------------------------------------------------------
   interpolate from grid to get electric field & force on my particles for ik
------------------------------------------------------------------------- */

void PPPMCGOMP::fieldforce_ik()
{
  // no local atoms with a charge => nothing to do

  if (num_charged == 0) return;

  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // ek = 3 components of E-field on particle

  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
  const double * _noalias const q = atom->q;
  const double qqrd2e = force->qqrd2e;
  const int nthreads = comm->nthreads;

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
    FFT_SCALAR dx,dy,dz,x0,y0,z0,ekx,eky,ekz;
    int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz;

    loop_setup_thr(ifrom,ito,tid,num_charged,nthreads);

    // get per thread data
    ThrData *thr = fix->get_thr(tid);
    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

    for (int j = ifrom; j < ito; ++j) {
      i = is_charged[j];

      nx = part2grid[i][0];
      ny = part2grid[i][1];
      nz = part2grid[i][2];
      dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv;
      dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv;
      dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv;

      compute_rho1d_thr(r1d,dx,dy,dz);

      ekx = eky = ekz = ZEROF;
      for (n = nlower; n <= nupper; n++) {
        mz = n+nz;
        z0 = r1d[2][n];
        for (m = nlower; m <= nupper; m++) {
          my = m+ny;
          y0 = z0*r1d[1][m];
          for (l = nlower; l <= nupper; l++) {
            mx = l+nx;
            x0 = y0*r1d[0][l];
            ekx -= x0*vdx_brick[mz][my][mx];
            eky -= x0*vdy_brick[mz][my][mx];
            ekz -= x0*vdz_brick[mz][my][mx];
          }
        }
      }

      // convert E-field to force

      const double qfactor = qqrd2e * scale * q[i];
      f[i].x += qfactor*ekx;
      f[i].y += qfactor*eky;
      if (slabflag != 2) f[i].z += qfactor*ekz;
    }
  } // end of parallel region
}

/* ----------------------------------------------------------------------
   interpolate from grid to get electric field & force on my particles for ad
------------------------------------------------------------------------- */

void PPPMCGOMP::fieldforce_ad()
{
  // no local atoms with a charge => nothing to do

  if (num_charged == 0) return;

  const double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda;
  const double hx_inv = nx_pppm/prd[0];
  const double hy_inv = ny_pppm/prd[1];
  const double hz_inv = nz_pppm/prd[2];

  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // ek = 3 components of E-field on particle

  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
  const double * _noalias const q = atom->q;
  const double qqrd2e = force->qqrd2e;
  const int nthreads = comm->nthreads;

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
    int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz;
    FFT_SCALAR dx,dy,dz,ekx,eky,ekz;
    double s1,s2,s3,sf;

    loop_setup_thr(ifrom,ito,tid,num_charged,nthreads);

    // get per thread data
    ThrData *thr = fix->get_thr(tid);
    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
    FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());

    for (int j = ifrom; j < ito; ++j) {
      i = is_charged[j];

      nx = part2grid[i][0];
      ny = part2grid[i][1];
      nz = part2grid[i][2];
      dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv;
      dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv;
      dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv;

      compute_rho1d_thr(r1d,dx,dy,dz);
      compute_drho1d_thr(d1d,dx,dy,dz);

      ekx = eky = ekz = ZEROF;
      for (n = nlower; n <= nupper; n++) {
        mz = n+nz;
        for (m = nlower; m <= nupper; m++) {
          my = m+ny;
          for (l = nlower; l <= nupper; l++) {
            mx = l+nx;
            ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
            eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
            ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
          }
        }
      }
      ekx *= hx_inv;
      eky *= hy_inv;
      ekz *= hz_inv;

      // convert E-field to force and substract self forces

      const double qi = q[i];
      const double qfactor = qqrd2e * scale * qi;

      s1 = x[i].x*hx_inv;
      sf = sf_coeff[0]*sin(MY_2PI*s1);
      sf += sf_coeff[1]*sin(MY_4PI*s1);
      sf *= 2.0*qi;
      f[i].x += qfactor*(ekx - sf);

      s2 = x[i].y*hy_inv;
      sf = sf_coeff[2]*sin(MY_2PI*s2);
      sf += sf_coeff[3]*sin(MY_4PI*s2);
      sf *= 2*qi;
      f[i].y += qfactor*(eky - sf);

      s3 = x[i].z*hz_inv;
      sf = sf_coeff[4]*sin(MY_2PI*s3);
      sf += sf_coeff[5]*sin(MY_4PI*s3);
      sf *= 2*qi;
      if (slabflag != 2) f[i].z += qfactor*(ekz - sf);
    }
  } // end of parallel region
}

/* ----------------------------------------------------------------------
   interpolate from grid to get per-atom energy/virial
------------------------------------------------------------------------- */

void PPPMCGOMP::fieldforce_peratom()
{
  // no local atoms with a charge => nothing to do

  if (num_charged == 0) return;

  // loop over my charges, interpolate from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt

  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
  const double * _noalias const q = atom->q;
  const int nthreads = comm->nthreads;

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
    FFT_SCALAR dx,dy,dz,x0,y0,z0;
    FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
    int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz;

    loop_setup_thr(ifrom,ito,tid,num_charged,nthreads);

    // get per thread data
    ThrData *thr = fix->get_thr(tid);
    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

    for (int j=ifrom; j < ito; ++j) {
      i = is_charged[j];

      nx = part2grid[i][0];
      ny = part2grid[i][1];
      nz = part2grid[i][2];
      dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv;
      dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv;
      dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv;

      compute_rho1d_thr(r1d,dx,dy,dz);

      u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
      for (n = nlower; n <= nupper; n++) {
        mz = n+nz;
        z0 = r1d[2][n];
        for (m = nlower; m <= nupper; m++) {
          my = m+ny;
          y0 = z0*r1d[1][m];
          for (l = nlower; l <= nupper; l++) {
            mx = l+nx;
            x0 = y0*r1d[0][l];
            if (eflag_atom) u += x0*u_brick[mz][my][mx];
            if (vflag_atom) {
              v0 += x0*v0_brick[mz][my][mx];
              v1 += x0*v1_brick[mz][my][mx];
              v2 += x0*v2_brick[mz][my][mx];
              v3 += x0*v3_brick[mz][my][mx];
              v4 += x0*v4_brick[mz][my][mx];
              v5 += x0*v5_brick[mz][my][mx];
            }
          }
        }
      }

      const double qi = q[i];
      if (eflag_atom) eatom[i] += qi*u;
      if (vflag_atom) {
        vatom[i][0] += qi*v0;
        vatom[i][1] += qi*v1;
        vatom[i][2] += qi*v2;
        vatom[i][3] += qi*v3;
        vatom[i][4] += qi*v4;
        vatom[i][5] += qi*v5;
      }
    }
  } // end of parallel region
}

/* ----------------------------------------------------------------------
   charge assignment into rho1d
   dx,dy,dz = distance of particle from "lower left" grid point
------------------------------------------------------------------------- */
void PPPMCGOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx,
                                const FFT_SCALAR &dy, const FFT_SCALAR &dz)
{
  int k,l;
  FFT_SCALAR r1,r2,r3;

  for (k = (1-order)/2; k <= order/2; k++) {
    r1 = r2 = r3 = ZEROF;

    for (l = order-1; l >= 0; l--) {
      r1 = rho_coeff[l][k] + r1*dx;
      r2 = rho_coeff[l][k] + r2*dy;
      r3 = rho_coeff[l][k] + r3*dz;
    }
    r1d[0][k] = r1;
    r1d[1][k] = r2;
    r1d[2][k] = r3;
  }
}

/* ----------------------------------------------------------------------
   charge assignment into drho1d
   dx,dy,dz = distance of particle from "lower left" grid point
------------------------------------------------------------------------- */

void PPPMCGOMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx,
			      const FFT_SCALAR &dy, const FFT_SCALAR &dz)
{
  int k,l;
  FFT_SCALAR r1,r2,r3;

  for (k = (1-order)/2; k <= order/2; k++) {
    r1 = r2 = r3 = ZEROF;

    for (l = order-2; l >= 0; l--) {
      r1 = drho_coeff[l][k] + r1*dx;
      r2 = drho_coeff[l][k] + r2*dy;
      r3 = drho_coeff[l][k] + r3*dz;
    }
    d1d[0][k] = r1;
    d1d[1][k] = r2;
    d1d[2][k] = r3;
  }
}