git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@9449 f3b2605a-c512-4ea7-a41b-209d697bcdaa

sjplimp
2013-02-12 15:38:24 +00:00
parent 0fa04785e3
commit aae97983b8
12 changed files with 1776 additions and 1156 deletions

View File: pppm_cg.cpp

@@ -19,8 +19,10 @@
 #include "mpi.h"
 #include "math.h"
 #include "stdlib.h"
+#include "string.h"
 #include "atom.h"
+#include "commgrid.h"
 #include "domain.h"
 #include "error.h"
 #include "force.h"
@@ -34,7 +36,11 @@ using namespace MathConst;
 #define OFFSET 16384
 #define SMALLQ 0.00001

-#if defined(FFT_SINGLE)
+enum{REVERSE_RHO};
+enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM};
+
+#ifdef FFT_SINGLE
 #define ZEROF 0.0f
 #else
 #define ZEROF 0.0
@@ -42,7 +48,7 @@ using namespace MathConst;
 /* ---------------------------------------------------------------------- */

-PPPMCG::PPPMCG(LAMMPS *lmp, int narg, char **arg) : PPPMOld(lmp, narg, arg)
+PPPMCG::PPPMCG(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg)
 {
   if ((narg < 1) || (narg > 2))
     error->all(FLERR,"Illegal kspace_style pppm/cg command");
@@ -54,6 +60,7 @@ PPPMCG::PPPMCG(LAMMPS *lmp, int narg, char **arg) : PPPMOld(lmp, narg, arg)
   num_charged = -1;
   is_charged = NULL;
+  group_group_enable = 1;
 }
 /* ----------------------------------------------------------------------
@@ -71,7 +78,6 @@ PPPMCG::~PPPMCG()
 void PPPMCG::compute(int eflag, int vflag)
 {
-  int i,j;

   // set energy/virial flags
   // invoke allocate_peratom() if needed for first time
@@ -82,6 +88,8 @@ void PPPMCG::compute(int eflag, int vflag)
   if (evflag_atom && !peratom_allocate_flag) {
     allocate_peratom();
+    cg_peratom->ghost_notify();
+    cg_peratom->setup();
     peratom_allocate_flag = 1;
   }
@@ -110,7 +118,7 @@ void PPPMCG::compute(int eflag, int vflag)
   double charged_frac, charged_fmax, charged_fmin;

   num_charged=0;
-  for (i=0; i < atom->nlocal; ++i)
+  for (int i=0; i < atom->nlocal; ++i)
     if (fabs(atom->q[i]) > smallq)
       ++num_charged;
@@ -148,7 +156,7 @@ void PPPMCG::compute(int eflag, int vflag)
   }

   num_charged = 0;
-  for (i = 0; i < atom->nlocal; ++i)
+  for (int i = 0; i < atom->nlocal; ++i)
     if (fabs(atom->q[i]) > smallq) {
       is_charged[num_charged] = i;
       ++num_charged;
@@ -164,6 +172,7 @@ void PPPMCG::compute(int eflag, int vflag)
   //   to fully sum contribution in their 3d bricks
   // remap from 3d decomposition to FFT decomposition

+  cg->reverse_comm(this,REVERSE_RHO);
   brick2fft();

   // compute potential gradient on my FFT grid and
@@ -176,11 +185,17 @@ void PPPMCG::compute(int eflag, int vflag)
   // all procs communicate E-field values
   // to fill ghost cells surrounding their 3d bricks

-  fillbrick();
+  if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD);
+  else cg->forward_comm(this,FORWARD_IK);

   // extra per-atom energy/virial communication

-  if (evflag_atom) fillbrick_peratom();
+  if (evflag_atom) {
+    if (differentiation_flag == 1 && vflag_atom)
+      cg_peratom->forward_comm(this,FORWARD_AD_PERATOM);
+    else if (differentiation_flag == 0)
+      cg_peratom->forward_comm(this,FORWARD_IK_PERATOM);
+  }

   // calculate the force on my particles
@@ -210,19 +225,18 @@ void PPPMCG::compute(int eflag, int vflag)
   if (vflag_global) {
     double virial_all[6];
     MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
-    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
+    for (int i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
   }

   // per-atom energy/virial
   // energy includes self-energy correction

   if (evflag_atom) {
-    double *q = atom->q;
-    int nlocal = atom->nlocal;
+    const double * const q = atom->q;

     if (eflag_atom) {
       for (int j = 0; j < num_charged; j++) {
-        int i = is_charged[j];
+        const int i = is_charged[j];
         eatom[i] *= 0.5;
         eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum /
           (g_ewald*g_ewald*volume);
@@ -231,16 +245,16 @@ void PPPMCG::compute(int eflag, int vflag)
     }

     if (vflag_atom) {
-      for (int j = 0; j < num_charged; j++) {
-        int i = is_charged[j];
-        for (int n = 0; n < 6; n++) vatom[i][n] *= 0.5*q[i]*qscale;
+      for (int n = 0; n < num_charged; n++) {
+        const int i = is_charged[n];
+        for (int j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale;
       }
     }
   }

   // 2d slab correction

-  if (slabflag) slabcorr();
+  if (slabflag == 1) slabcorr();

   // convert atoms back from lamda to box coords
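
Note on the communication calls above: the rewritten PPPM base class funnels all grid communication through CommGrid objects. After charge assignment, cg->reverse_comm(this,REVERSE_RHO) sums ghost-cell charge into the cells that own them; after the FFTs, cg->forward_comm(...) copies owned potential/field values back out to the ghost cells. A toy, self-contained sketch of the two directions (plain arrays stand in for the CommGrid machinery; the values are made up):

    #include <cstdio>

    int main() {
      // one owned row of grid cells, plus the contribution a neighboring
      // processor spread into its ghost copy of my boundary cell
      double owned[4] = {1.0, 1.0, 1.0, 1.0};
      double neighbor_ghost = 0.5;

      owned[3] += neighbor_ghost;  // "reverse": sum ghosts into owners (charge)
      double my_ghost = owned[0];  // "forward": replicate owner values to ghosts (field)

      printf("%g %g\n", owned[3], my_ghost);
      return 0;
    }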
@@ -279,7 +293,8 @@ void PPPMCG::particle_map()
     if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
         ny+nlower < nylo_out || ny+nupper > nyhi_out ||
-        nz+nlower < nzlo_out || nz+nupper > nzhi_out) flag = 1;
+        nz+nlower < nzlo_out || nz+nupper > nzhi_out)
+      flag = 1;
   }

   if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM");
@@ -299,8 +314,8 @@ void PPPMCG::make_rho()
   // clear 3d density array

-  FFT_SCALAR *vec = &density_brick[nzlo_out][nylo_out][nxlo_out];
-  for (i = 0; i < ngrid; i++) vec[i] = ZEROF;
+  memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));

   // loop over my charges, add their contribution to nearby grid points
   // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
@@ -311,7 +326,7 @@ void PPPMCG::make_rho()
   double **x = atom->x;

   for (int j = 0; j < num_charged; j++) {
-    int i = is_charged[j];
+    i = is_charged[j];

     nx = part2grid[i][0];
     ny = part2grid[i][1];
@@ -337,12 +352,11 @@ void PPPMCG::make_rho()
       }
     }
   }
 }

 /* ----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
+   interpolate from grid to get electric field & force on my particles for ik
 ------------------------------------------------------------------------- */

-void PPPMCG::fieldforce()
+void PPPMCG::fieldforce_ik()
 {
   int i,l,m,n,nx,ny,nz,mx,my,mz;
   FFT_SCALAR dx,dy,dz,x0,y0,z0;
@@ -392,74 +406,162 @@ void PPPMCG::fieldforce()
     const double qfactor = force->qqrd2e * scale * q[i];
     f[i][0] += qfactor*ekx;
     f[i][1] += qfactor*eky;
-    f[i][2] += qfactor*ekz;
+    if (slabflag != 2) f[i][2] += qfactor*ekz;
   }
 }

 /* ----------------------------------------------------------------------
-   interpolate from grid to get per-atom energy/virial
+   interpolate from grid to get electric field & force on my particles for ad
 ------------------------------------------------------------------------- */

+void PPPMCG::fieldforce_ad()
+{
+  int i,l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz;
+  FFT_SCALAR ekx,eky,ekz;
+  double s1,s2,s3;
+  double sf = 0.0;
+
+  double *prd;
+  if (triclinic == 0) prd = domain->prd;
+  else prd = domain->prd_lamda;
+
+  double xprd = prd[0];
+  double yprd = prd[1];
+  double zprd = prd[2];
+
+  double hx_inv = nx_pppm/xprd;
+  double hy_inv = ny_pppm/yprd;
+  double hz_inv = nz_pppm/zprd;
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  double *q = atom->q;
+  double **x = atom->x;
+  double **f = atom->f;
+
+  for (int j = 0; j < num_charged; j++) {
+    i = is_charged[j];
+
+    nx = part2grid[i][0];
+    ny = part2grid[i][1];
+    nz = part2grid[i][2];
+    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+    compute_drho1d(dx,dy,dz);
+
+    ekx = eky = ekz = ZEROF;
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          ekx += drho1d[0][l]*rho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx];
+          eky += rho1d[0][l]*drho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx];
+          ekz += rho1d[0][l]*rho1d[1][m]*drho1d[2][n]*u_brick[mz][my][mx];
+        }
+      }
+    }
+    ekx *= hx_inv;
+    eky *= hy_inv;
+    ekz *= hz_inv;
+
+    // convert E-field to force and subtract self forces
+
+    const double qfactor = force->qqrd2e * scale;
+
+    s1 = x[i][0]*hx_inv;
+    s2 = x[i][1]*hy_inv;
+    s3 = x[i][2]*hz_inv;
+    sf = sf_coeff[0]*sin(2*MY_PI*s1);
+    sf += sf_coeff[1]*sin(4*MY_PI*s1);
+    sf *= 2*q[i]*q[i];
+    f[i][0] += qfactor*(ekx*q[i] - sf);
+
+    sf = sf_coeff[2]*sin(2*MY_PI*s2);
+    sf += sf_coeff[3]*sin(4*MY_PI*s2);
+    sf *= 2*q[i]*q[i];
+    f[i][1] += qfactor*(eky*q[i] - sf);
+
+    sf = sf_coeff[4]*sin(2*MY_PI*s3);
+    sf += sf_coeff[5]*sin(4*MY_PI*s3);
+    sf *= 2*q[i]*q[i];
+    if (slabflag != 2) f[i][2] += qfactor*(ekz*q[i] - sf);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */
 void PPPMCG::fieldforce_peratom()
 {
   int i,l,m,n,nx,ny,nz,mx,my,mz;
   FFT_SCALAR dx,dy,dz,x0,y0,z0;
   FFT_SCALAR u,v0,v1,v2,v3,v4,v5;

   // loop over my charges, interpolate from nearby grid points
   // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
   // (dx,dy,dz) = distance to "lower left" grid pt
   // (mx,my,mz) = global coords of moving stencil pt

   double *q = atom->q;
   double **x = atom->x;
-  double **f = atom->f;

   for (int j = 0; j < num_charged; j++) {
     i = is_charged[j];

     nx = part2grid[i][0];
     ny = part2grid[i][1];
     nz = part2grid[i][2];
     dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
     dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
     dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

     compute_rho1d(dx,dy,dz);

     u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
     for (n = nlower; n <= nupper; n++) {
       mz = n+nz;
       z0 = rho1d[2][n];
       for (m = nlower; m <= nupper; m++) {
         my = m+ny;
         y0 = z0*rho1d[1][m];
         for (l = nlower; l <= nupper; l++) {
           mx = l+nx;
           x0 = y0*rho1d[0][l];
           if (eflag_atom) u += x0*u_brick[mz][my][mx];
           if (vflag_atom) {
             v0 += x0*v0_brick[mz][my][mx];
             v1 += x0*v1_brick[mz][my][mx];
             v2 += x0*v2_brick[mz][my][mx];
             v3 += x0*v3_brick[mz][my][mx];
             v4 += x0*v4_brick[mz][my][mx];
             v5 += x0*v5_brick[mz][my][mx];
           }
         }
       }
     }

     if (eflag_atom) eatom[i] += q[i]*u;
     if (vflag_atom) {
-      vatom[i][0] += v0;
-      vatom[i][1] += v1;
-      vatom[i][2] += v2;
-      vatom[i][3] += v3;
-      vatom[i][4] += v4;
-      vatom[i][5] += v5;
+      vatom[i][0] += q[i]*v0;
+      vatom[i][1] += q[i]*v1;
+      vatom[i][2] += q[i]*v2;
+      vatom[i][3] += q[i]*v3;
+      vatom[i][4] += q[i]*v4;
+      vatom[i][5] += q[i]*v5;
     }
   }
 }
 /* ----------------------------------------------------------------------
@@ -471,14 +573,17 @@ void PPPMCG::fieldforce_peratom()
 void PPPMCG::slabcorr()
 {
+  int i,j;

   // compute local contribution to global dipole moment

-  double *q = atom->q;
-  double **x = atom->x;
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
   double dipole = 0.0;

-  for (int j = 0; j < num_charged; j++) {
-    int i = is_charged[j];
+  for (j = 0; j < num_charged; j++) {
+    i = is_charged[j];
     dipole += q[i]*x[i][2];
   }
@@ -489,29 +594,101 @@ void PPPMCG::slabcorr()
   // compute corrections

-  const double e_slabcorr = 2.0*MY_PI*dipole_all*dipole_all/volume;
+  const double e_slabcorr = MY_2PI*dipole_all*dipole_all/volume;
   const double qscale = force->qqrd2e * scale;

   if (eflag_global) energy += qscale * e_slabcorr;

-  //per-atom energy
+  // per-atom energy

   if (eflag_atom) {
-    double efact = 2.0*MY_PI*dipole_all/volume;
-    for (int j = 0; j < num_charged; j++) {
-      int i = is_charged[j];
+    const double efact = MY_2PI*dipole_all/volume;
+    for (j = 0; j < num_charged; j++) {
+      i = is_charged[j];
       eatom[i] += qscale * q[i]*x[i][2]*efact;
     }
   }

   // add on force corrections

-  const double ffact = -4.0*MY_PI*dipole_all/volume * qscale;
-  double **f = atom->f;
+  const double ffact = -MY_4PI*dipole_all/volume;
+  double * const * const f = atom->f;

-  for (int j = 0; j < num_charged; j++) {
-    int i = is_charged[j];
-    f[i][2] += q[i]*ffact;
-  }
+  for (j = 0; j < num_charged; j++) {
+    i = is_charged[j];
+    f[i][2] += qscale * q[i]*ffact;
+  }
 }
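
For reference, the slab correction rewritten above is the standard dipole term for slab (2d-periodic) geometries. With cell volume V (including the slab_volfactor padding) and slab dipole moment M_z = sum_i q_i z_i, accumulated here over the charged atoms only, the code's constants correspond to

    E_slab = (2*pi/V) * M_z^2          i.e.  MY_2PI*dipole_all*dipole_all/volume
    F_i,z  = -(4*pi/V) * q_i * M_z     i.e.  ffact = -MY_4PI*dipole_all/volume

with qscale supplying the qqrd2e unit conversion in both the energy and force updates.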
+/* ----------------------------------------------------------------------
+   create discretized "density" on section of global grid due to my particles
+   density(x,y,z) = charge "density" at grid points of my 3d brick
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid for group-group interactions
+------------------------------------------------------------------------- */
+
+void PPPMCG::make_rho_groups(int groupbit_A, int groupbit_B, int BA_flag)
+{
+  int i,l,m,n,nx,ny,nz,mx,my,mz;
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
+
+  // clear 3d density arrays
+
+  memset(&(density_A_brick[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+  memset(&(density_B_brick[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
+  const int * const mask = atom->mask;
+
+  for (int j = 0; j < num_charged; j++) {
+    i = is_charged[j];
+
+    if ((mask[i] & groupbit_A) && (mask[i] & groupbit_B))
+      if (BA_flag) continue;
+
+    if ((mask[i] & groupbit_A) || (mask[i] & groupbit_B)) {
+
+      nx = part2grid[i][0];
+      ny = part2grid[i][1];
+      nz = part2grid[i][2];
+      dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+      dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+      dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+      compute_rho1d(dx,dy,dz);
+
+      z0 = delvolinv * q[i];
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        y0 = z0*rho1d[2][n];
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          x0 = y0*rho1d[1][m];
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+
+            // group A
+            if (mask[i] & groupbit_A)
+              density_A_brick[mz][my][mx] += x0*rho1d[0][l];
+
+            // group B
+            if (mask[i] & groupbit_B)
+              density_B_brick[mz][my][mx] += x0*rho1d[0][l];
+          }
+        }
+      }
+    }
+  }
+}
@@ -521,7 +698,7 @@ void PPPMCG::slabcorr()
 double PPPMCG::memory_usage()
 {
-  double bytes = PPPMOld::memory_usage();
+  double bytes = PPPM::memory_usage();
   bytes += nmax * sizeof(int);
   return bytes;
 }
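
The whole pppm/cg optimization visible in this file rests on one small data structure: a compact list of the indices of atoms carrying a non-negligible charge, rebuilt in compute() and then used by every per-atom grid loop (make_rho, fieldforce_*, slabcorr, make_rho_groups). A minimal standalone sketch of the two-pass pattern, with plain arrays standing in for the Atom class:

    #include <cmath>
    #include <cstdio>

    int main() {
      // stand-ins for atom->q and atom->nlocal
      const double q[] = {0.0, -0.8, 0.4, 0.0, 0.4};
      const int nlocal = 5;
      const double smallq = 1.0e-5;          // plays the role of SMALLQ

      // pass 1: count charged atoms (mirrors the counting loop in compute())
      int num_charged = 0;
      for (int i = 0; i < nlocal; ++i)
        if (std::fabs(q[i]) > smallq) ++num_charged;

      // pass 2: record their indices, as done for is_charged[]
      int is_charged[5];
      num_charged = 0;
      for (int i = 0; i < nlocal; ++i)
        if (std::fabs(q[i]) > smallq) is_charged[num_charged++] = i;

      // subsequent loops then touch only charged atoms
      for (int j = 0; j < num_charged; ++j)
        printf("charged atom %d, q = %g\n", is_charged[j], q[is_charged[j]]);
      return 0;
    }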

View File: pppm_cg.h

@@ -20,11 +20,11 @@ KSpaceStyle(pppm/cg,PPPMCG)
 #ifndef LMP_PPPM_CG_H
 #define LMP_PPPM_CG_H

-#include "pppm_old.h"
+#include "pppm.h"

 namespace LAMMPS_NS {

-class PPPMCG : public PPPMOld {
+class PPPMCG : public PPPM {
  public:
   PPPMCG(class LAMMPS *, int, char **);
   virtual ~PPPMCG();
@@ -38,9 +38,11 @@ class PPPMCG : public PPPMOld {
   virtual void particle_map();
   virtual void make_rho();
-  virtual void fieldforce();
+  virtual void fieldforce_ik();
+  virtual void fieldforce_ad();
   virtual void fieldforce_peratom();
   virtual void slabcorr();
+  virtual void make_rho_groups(int, int, int);
 };

 }

View File: pppm_tip4p.cpp

@@ -226,7 +226,7 @@ void PPPMTIP4P::fieldforce_ik()
     if (type[i] != typeO) {
       f[i][0] += qfactor*ekx;
       f[i][1] += qfactor*eky;
-      f[i][2] += qfactor*ekz;
+      if (slabflag != 2) f[i][2] += qfactor*ekz;

     } else {
       fx = qfactor * ekx;
@@ -236,15 +236,15 @@ void PPPMTIP4P::fieldforce_ik()
       f[i][0] += fx*(1 - alpha);
       f[i][1] += fy*(1 - alpha);
-      f[i][2] += fz*(1 - alpha);
+      if (slabflag != 2) f[i][2] += fz*(1 - alpha);

       f[iH1][0] += 0.5*alpha*fx;
       f[iH1][1] += 0.5*alpha*fy;
-      f[iH1][2] += 0.5*alpha*fz;
+      if (slabflag != 2) f[iH1][2] += 0.5*alpha*fz;

       f[iH2][0] += 0.5*alpha*fx;
       f[iH2][1] += 0.5*alpha*fy;
-      f[iH2][2] += 0.5*alpha*fz;
+      if (slabflag != 2) f[iH2][2] += 0.5*alpha*fz;
     }
   }
 }
@@ -256,7 +256,7 @@ void PPPMTIP4P::fieldforce_ik()
 void PPPMTIP4P::fieldforce_ad()
 {
   int i,l,m,n,nx,ny,nz,mx,my,mz;
-  FFT_SCALAR dx,dy,dz,x0,y0,z0,dx0,dy0,dz0;
+  FFT_SCALAR dx,dy,dz;
   FFT_SCALAR ekx,eky,ekz;
   double *xi;
   int iH1,iH2;
@@ -272,14 +272,11 @@ void PPPMTIP4P::fieldforce_ad()
   double xprd = prd[0];
   double yprd = prd[1];
   double zprd = prd[2];
-  double zprd_slab = zprd*slab_volfactor;

   double hx_inv = nx_pppm/xprd;
   double hy_inv = ny_pppm/yprd;
   double hz_inv = nz_pppm/zprd;

   // loop over my charges, interpolate electric field from nearby grid points
   // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
   // (dx,dy,dz) = distance to "lower left" grid pt
@@ -302,9 +299,9 @@ void PPPMTIP4P::fieldforce_ad()
     nx = part2grid[i][0];
     ny = part2grid[i][1];
     nz = part2grid[i][2];
-    dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-    dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-    dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+    dx = nx+shiftone - (xi[0]-boxlo[0])*delxinv;
+    dy = ny+shiftone - (xi[1]-boxlo[1])*delyinv;
+    dz = nz+shiftone - (xi[2]-boxlo[2])*delzinv;

     compute_rho1d(dx,dy,dz);
     compute_drho1d(dx,dy,dz);
@@ -335,38 +332,38 @@ void PPPMTIP4P::fieldforce_ad()
     s3 = x[i][2]*hz_inv;
     sf = sf_coeff[0]*sin(2*MY_PI*s1);
     sf += sf_coeff[1]*sin(4*MY_PI*s1);
-    sf *= 2*q[i]*q[i];
+    sf *= 2.0*q[i]*q[i];
     fx = qfactor*(ekx*q[i] - sf);

     sf = sf_coeff[2]*sin(2*MY_PI*s2);
     sf += sf_coeff[3]*sin(4*MY_PI*s2);
-    sf *= 2*q[i]*q[i];
+    sf *= 2.0*q[i]*q[i];
     fy = qfactor*(eky*q[i] - sf);

     sf = sf_coeff[4]*sin(2*MY_PI*s3);
     sf += sf_coeff[5]*sin(4*MY_PI*s3);
-    sf *= 2*q[i]*q[i];
+    sf *= 2.0*q[i]*q[i];
     fz = qfactor*(ekz*q[i] - sf);

     if (type[i] != typeO) {
       f[i][0] += fx;
       f[i][1] += fy;
-      f[i][2] += fz;
+      if (slabflag != 2) f[i][2] += fz;

     } else {
       find_M(i,iH1,iH2,xM);

       f[i][0] += fx*(1 - alpha);
       f[i][1] += fy*(1 - alpha);
-      f[i][2] += fz*(1 - alpha);
+      if (slabflag != 2) f[i][2] += fz*(1 - alpha);

       f[iH1][0] += 0.5*alpha*fx;
       f[iH1][1] += 0.5*alpha*fy;
-      f[iH1][2] += 0.5*alpha*fz;
+      if (slabflag != 2) f[iH1][2] += 0.5*alpha*fz;

       f[iH2][0] += 0.5*alpha*fx;
       f[iH2][1] += 0.5*alpha*fy;
-      f[iH2][2] += 0.5*alpha*fz;
+      if (slabflag != 2) f[iH2][2] += 0.5*alpha*fz;
     }
   }
 }
@@ -392,7 +389,6 @@ void PPPMTIP4P::fieldforce_peratom()
   // ek = 3 components of E-field on particle

   double *q = atom->q;
   double **x = atom->x;
-  double **f = atom->f;
   int *type = atom->type;
   int nlocal = atom->nlocal;

View File: pppm_cg_omp.cpp

@@ -19,9 +19,12 @@
 #include "atom.h"
 #include "comm.h"
 #include "domain.h"
+#include "error.h"
+#include "fix_omp.h"
 #include "force.h"
 #include "memory.h"
 #include "math_const.h"
+#include "math_special.h"

 #include <string.h>
 #include <math.h>
@@ -29,12 +32,14 @@
 #include "suffix.h"
 using namespace LAMMPS_NS;
 using namespace MathConst;
+using namespace MathSpecial;

-#define SMALLQ 0.00001
-#if defined(FFT_SINGLE)
+#ifdef FFT_SINGLE
 #define ZEROF 0.0f
-#define ONEF 1.0f
 #else
 #define ZEROF 0.0
-#define ONEF 1.0
 #endif

 #define EPS_HOC 1.0e-7
@@ -55,7 +60,27 @@ void PPPMCGOMP::allocate()
 {
   PPPMCG::allocate();

-  const int nthreads = comm->nthreads;
-
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
 #if defined(_OPENMP)
     const int tid = omp_get_thread_num();
 #else
     const int tid = 0;
 #endif
-    FFT_SCALAR **rho1d_thr;
-    memory->create2d_offset(rho1d_thr,3,-order/2,order/2,"pppm:rho1d_thr");
     ThrData *thr = fix->get_thr(tid);
-    thr->init_pppm(static_cast<void *>(rho1d_thr));
+    thr->init_pppm(order,memory);
   }
-
-  const int nzend = (nzhi_out-nzlo_out+1)*nthreads + nzlo_out -1;
-
-  // reallocate density brick, so it fits our needs
-  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
-  memory->create3d_offset(density_brick,nzlo_out,nzend,nylo_out,nyhi_out,
-                          nxlo_out,nxhi_out,"pppm:density_brick");
 }
@@ -66,153 +91,26 @@
-// NOTE: special version of reduce_data for FFT_SCALAR data type.
-// reduce per thread data into the first part of the data
-// array that is used for the non-threaded parts and reset
-// the temporary storage to 0.0. this routine depends on
-// multi-dimensional arrays like force stored in this order
-// x1,y1,z1,x2,y2,z2,...
-// we need to post a barrier to wait until all threads are done
-// writing to the array.
-static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid)
-{
-#if defined(_OPENMP)
-  // NOOP in non-threaded execution.
-  if (nthreads == 1) return;
-#pragma omp barrier
-  {
-    const int nvals = ndim*nall;
-    const int idelta = nvals/nthreads + 1;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);
-
-    // this if protects against having more threads than atoms
-    if (ifrom < nall) {
-      for (int m = ifrom; m < ito; ++m) {
-        for (int n = 1; n < nthreads; ++n) {
-          dall[m] += dall[n*nvals + m];
-          dall[n*nvals + m] = 0.0;
-        }
-      }
-    }
-  }
-#else
-  // NOOP in non-threaded execution.
-  return;
-#endif
-}
-
 /* ----------------------------------------------------------------------
    free memory that depends on # of K-vectors and order
 ------------------------------------------------------------------------- */

 void PPPMCGOMP::deallocate()
 {
   PPPMCG::deallocate();
-  for (int i=0; i < comm->nthreads; ++i) {
-    ThrData * thr = fix->get_thr(i);
-    FFT_SCALAR ** rho1d_thr = static_cast<FFT_SCALAR **>(thr->get_rho1d());
-    memory->destroy2d_offset(rho1d_thr,-order/2);
-  }
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    thr->init_pppm(-order,memory);
+  }
 }
 /* ----------------------------------------------------------------------
-   adjust PPPMCG coeffs, called initially and whenever volume has changed
+   pre-compute modified (Hockney-Eastwood) Coulomb Green's function
 ------------------------------------------------------------------------- */

-void PPPMCGOMP::setup()
+void PPPMCGOMP::compute_gf_ik()
 {
-  int i,j,k,n;
-  double *prd;
-
-  // volume-dependent factors
-  // adjust z dimension for 2d slab PPPM
-  // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
-
-  if (triclinic == 0) prd = domain->prd;
-  else prd = domain->prd_lamda;
+  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;

   const double xprd = prd[0];
   const double yprd = prd[1];
   const double zprd = prd[2];
   const double zprd_slab = zprd*slab_volfactor;
-  volume = xprd * yprd * zprd_slab;
-
-  delxinv = nx_pppm/xprd;
-  delyinv = ny_pppm/yprd;
-  delzinv = nz_pppm/zprd_slab;
-  delvolinv = delxinv*delyinv*delzinv;
-
-  const double unitkx = (2.0*MY_PI/xprd);
-  const double unitky = (2.0*MY_PI/yprd);
-  const double unitkz = (2.0*MY_PI/zprd_slab);
-
-  // fkx,fky,fkz for my FFT grid pts
-
-  double per;
-
-  for (i = nxlo_fft; i <= nxhi_fft; i++) {
-    per = i - nx_pppm*(2*i/nx_pppm);
-    fkx[i] = unitkx*per;
-  }
-
-  for (i = nylo_fft; i <= nyhi_fft; i++) {
-    per = i - ny_pppm*(2*i/ny_pppm);
-    fky[i] = unitky*per;
-  }
-
-  for (i = nzlo_fft; i <= nzhi_fft; i++) {
-    per = i - nz_pppm*(2*i/nz_pppm);
-    fkz[i] = unitkz*per;
-  }
-
-  // virial coefficients
-
-  double sqk,vterm;
-
-  n = 0;
-  for (k = nzlo_fft; k <= nzhi_fft; k++) {
-    for (j = nylo_fft; j <= nyhi_fft; j++) {
-      for (i = nxlo_fft; i <= nxhi_fft; i++) {
-        sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k];
-        if (sqk == 0.0) {
-          vg[n][0] = 0.0;
-          vg[n][1] = 0.0;
-          vg[n][2] = 0.0;
-          vg[n][3] = 0.0;
-          vg[n][4] = 0.0;
-          vg[n][5] = 0.0;
-        } else {
-          vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
-          vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i];
-          vg[n][1] = 1.0 + vterm*fky[j]*fky[j];
-          vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k];
-          vg[n][3] = vterm*fkx[i]*fky[j];
-          vg[n][4] = vterm*fkx[i]*fkz[k];
-          vg[n][5] = vterm*fky[j]*fkz[k];
-        }
-        n++;
-      }
-    }
-  }
-
-  // modified (Hockney-Eastwood) Coulomb Green's function
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);

   const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
                                     pow(-log(EPS_HOC),0.25));
@@ -220,93 +118,188 @@ void PPPMCGOMP::setup()
   const int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
                                     pow(-log(EPS_HOC),0.25));
   const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
                                     pow(-log(EPS_HOC),0.25));
+  const int numk = nxhi_fft - nxlo_fft + 1;
+  const int numl = nyhi_fft - nylo_fft + 1;

-  const double form = 1.0;
+  const int twoorder = 2*order;

 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
-    int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m;
-    double snx,sny,snz,sqk;
+    double snx,sny,snz;
     double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
     double sum1,dot1,dot2;
     double numerator,denominator;
-    const double gew2 = -4.0*g_ewald*g_ewald;
-
-    const int nnx = nxhi_fft-nxlo_fft+1;
-    const int nny = nyhi_fft-nylo_fft+1;
+    double sqk;
+    int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid;

-    loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads);
+    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);

-    for (m = nzlo_fft; m <= nzhi_fft; m++) {
-
-      const double fkzm = fkz[m];
-      snz = sin(0.5*fkzm*zprd_slab/nz_pppm);
-      snz *= snz;
-
-      for (l = nylo_fft; l <= nyhi_fft; l++) {
-        const double fkyl = fky[l];
-        sny = sin(0.5*fkyl*yprd/ny_pppm);
-        sny *= sny;
-
-        for (k = nxlo_fft; k <= nxhi_fft; k++) {
-
-          /* only compute the part designated to this thread */
-          nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft));
-          if ((nn < nnfrom) || (nn >=nnto)) continue;
-
-          const double fkxk = fkx[k];
-          snx = sin(0.5*fkxk*xprd/nx_pppm);
-          snx *= snx;
-
-          sqk = fkxk*fkxk + fkyl*fkyl + fkzm*fkzm;
-
-          if (sqk != 0.0) {
-            numerator = form*MY_4PI/sqk;
-            denominator = gf_denom(snx,sny,snz);
-            const double dorder = static_cast<double>(order);
-            sum1 = 0.0;
-            for (nx = -nbx; nx <= nbx; nx++) {
-              qx = fkxk + unitkx*nx_pppm*nx;
-              sx = exp(qx*qx/gew2);
-              wx = 1.0;
-              argx = 0.5*qx*xprd/nx_pppm;
-              if (argx != 0.0) wx = pow(sin(argx)/argx,dorder);
-              wx *=wx;
-
-              for (ny = -nby; ny <= nby; ny++) {
-                qy = fkyl + unitky*ny_pppm*ny;
-                sy = exp(qy*qy/gew2);
-                wy = 1.0;
-                argy = 0.5*qy*yprd/ny_pppm;
-                if (argy != 0.0) wy = pow(sin(argy)/argy,dorder);
-                wy *= wy;
-
-                for (nz = -nbz; nz <= nbz; nz++) {
-                  qz = fkzm + unitkz*nz_pppm*nz;
-                  sz = exp(qz*qz/gew2);
-                  wz = 1.0;
-                  argz = 0.5*qz*zprd_slab/nz_pppm;
-                  if (argz != 0.0) wz = pow(sin(argz)/argz,dorder);
-                  wz *= wz;
-
-                  dot1 = fkxk*qx + fkyl*qy + fkzm*qz;
-                  dot2 = qx*qx+qy*qy+qz*qz;
-                  sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
-                }
-              }
-            }
-            greensfn[nn] = numerator*sum1/denominator;
-          } else greensfn[nn] = 0.0;
-        }
-      }
-    }
-  }
-}
+    for (n = nfrom; n < nto; ++n) {
+      m = n / (numl*numk);
+      l = (n - m*numl*numk) / numk;
+      k = n - m*numl*numk - l*numk;
+      m += nzlo_fft;
+      l += nylo_fft;
+      k += nxlo_fft;
+
+      mper = m - nz_pppm*(2*m/nz_pppm);
+      snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
+
+      lper = l - ny_pppm*(2*l/ny_pppm);
+      sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
+
+      kper = k - nx_pppm*(2*k/nx_pppm);
+      snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
+
+      sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
+
+      if (sqk != 0.0) {
+        numerator = 12.5663706/sqk;
+        denominator = gf_denom(snx,sny,snz);
+        sum1 = 0.0;
+
+        for (nx = -nbx; nx <= nbx; nx++) {
+          qx = unitkx*(kper+nx_pppm*nx);
+          sx = exp(-0.25*square(qx/g_ewald));
+          argx = 0.5*qx*xprd/nx_pppm;
+          wx = powsinxx(argx,twoorder);
+
+          for (ny = -nby; ny <= nby; ny++) {
+            qy = unitky*(lper+ny_pppm*ny);
+            sy = exp(-0.25*square(qy/g_ewald));
+            argy = 0.5*qy*yprd/ny_pppm;
+            wy = powsinxx(argy,twoorder);
+
+            for (nz = -nbz; nz <= nbz; nz++) {
+              qz = unitkz*(mper+nz_pppm*nz);
+              sz = exp(-0.25*square(qz/g_ewald));
+              argz = 0.5*qz*zprd_slab/nz_pppm;
+              wz = powsinxx(argz,twoorder);
+
+              dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
+              dot2 = qx*qx+qy*qy+qz*qz;
+              sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
+            }
+          }
+        }
+        greensfn[n] = numerator*sum1/denominator;
+      } else greensfn[n] = 0.0;
+    }
+  } // end of parallel region
+}
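
The threaded Green's-function loop above replaces the old triple loop (which skipped iterations not assigned to the thread) by flattening (k,l,m) into one counter n, letting loop_setup_thr() hand each thread a contiguous slice of [0,nfft). A small self-contained check of the inverse mapping used in compute_gf_ik(); the grid extents here are made-up values:

    #include <cstdio>

    int main() {
      const int nxlo_fft = 0, numk = 4;   // k (x) varies fastest
      const int nylo_fft = 0, numl = 3;   // l (y)
      const int nzlo_fft = 0, numm = 2;   // m (z) varies slowest
      const int nfft = numk*numl*numm;

      for (int n = 0; n < nfft; ++n) {
        // inverse mapping exactly as in the diff
        int m = n / (numl*numk);
        int l = (n - m*numl*numk) / numk;
        int k = n - m*numl*numk - l*numk;
        m += nzlo_fft; l += nylo_fft; k += nxlo_fft;
        // the forward mapping must reproduce n
        int nn = (m-nzlo_fft)*numl*numk + (l-nylo_fft)*numk + (k-nxlo_fft);
        if (nn != n) { printf("mismatch at %d\n", n); return 1; }
      }
      printf("round-trip OK for %d grid points\n", nfft);
      return 0;
    }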
 /* ----------------------------------------------------------------------
-   run the regular toplevel compute method from plain PPPPMCG
+   compute optimized Green's function for energy calculation
+------------------------------------------------------------------------- */
+
+void PPPMCGOMP::compute_gf_ad()
+{
+  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);
+
+  const int numk = nxhi_fft - nxlo_fft + 1;
+  const int numl = nyhi_fft - nylo_fft + 1;
+
+  const int twoorder = 2*order;
+  double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
+#endif
+  {
+    double snx,sny,snz,sqk;
+    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+    double numerator,denominator;
+    int k,l,m,kper,lper,mper,n,nfrom,nto,tid;
+
+    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
+
+    for (n = nfrom; n < nto; ++n) {
+      m = n / (numl*numk);
+      l = (n - m*numl*numk) / numk;
+      k = n - m*numl*numk - l*numk;
+      m += nzlo_fft;
+      l += nylo_fft;
+      k += nxlo_fft;
+
+      mper = m - nz_pppm*(2*m/nz_pppm);
+      qz = unitkz*mper;
+      snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
+      sz = exp(-0.25*square(qz/g_ewald));
+      argz = 0.5*qz*zprd_slab/nz_pppm;
+      wz = powsinxx(argz,twoorder);
+
+      lper = l - ny_pppm*(2*l/ny_pppm);
+      qy = unitky*lper;
+      sny = square(sin(0.5*qy*yprd/ny_pppm));
+      sy = exp(-0.25*square(qy/g_ewald));
+      argy = 0.5*qy*yprd/ny_pppm;
+      wy = powsinxx(argy,twoorder);
+
+      kper = k - nx_pppm*(2*k/nx_pppm);
+      qx = unitkx*kper;
+      snx = square(sin(0.5*qx*xprd/nx_pppm));
+      sx = exp(-0.25*square(qx/g_ewald));
+      argx = 0.5*qx*xprd/nx_pppm;
+      wx = powsinxx(argx,twoorder);
+
+      sqk = qx*qx + qy*qy + qz*qz;
+
+      if (sqk != 0.0) {
+        numerator = MY_4PI/sqk;
+        denominator = gf_denom(snx,sny,snz);
+        greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
+        sf0 += sf_precoeff1[n]*greensfn[n];
+        sf1 += sf_precoeff2[n]*greensfn[n];
+        sf2 += sf_precoeff3[n]*greensfn[n];
+        sf3 += sf_precoeff4[n]*greensfn[n];
+        sf4 += sf_precoeff5[n]*greensfn[n];
+        sf5 += sf_precoeff6[n]*greensfn[n];
+      } else {
+        greensfn[n] = 0.0;
+        sf0 += sf_precoeff1[n]*greensfn[n];
+        sf1 += sf_precoeff2[n]*greensfn[n];
+        sf2 += sf_precoeff3[n]*greensfn[n];
+        sf3 += sf_precoeff4[n]*greensfn[n];
+        sf4 += sf_precoeff5[n]*greensfn[n];
+        sf5 += sf_precoeff6[n]*greensfn[n];
+      }
+    }
+  } // end of parallel region
+
+  // compute the coefficients for the self-force correction
+
+  double prex, prey, prez, tmp[6];
+  prex = prey = prez = MY_PI/volume;
+  prex *= nx_pppm/xprd;
+  prey *= ny_pppm/yprd;
+  prez *= nz_pppm/zprd_slab;
+  tmp[0] = sf0 * prex;
+  tmp[1] = sf1 * prex*2;
+  tmp[2] = sf2 * prey;
+  tmp[3] = sf3 * prey*2;
+  tmp[4] = sf4 * prez;
+  tmp[5] = sf5 * prez*2;
+
+  // communicate values with other procs
+
+  MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
+}
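
compute_gf_ad() accumulates the six self-force prefactors (sf0..sf5) across threads with an OpenMP reduction clause rather than shared writes, then combines them across MPI ranks with MPI_Allreduce. A minimal sketch of the reduction idiom, using a toy loop and two accumulators:

    #include <cstdio>

    int main() {
      const int n = 1000000;
      double s0 = 0.0, s1 = 0.0;       // one accumulator per reduced quantity
    #if defined(_OPENMP)
    #pragma omp parallel for reduction(+:s0,s1)
    #endif
      for (int i = 0; i < n; ++i) {
        s0 += 1.0/n;                   // each thread adds into a private copy;
        s1 += 2.0/n;                   // OpenMP sums the copies at the end
      }
      printf("%g %g\n", s0, s1);       // ~1 and ~2 for any thread count
      return 0;
    }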
+/* ----------------------------------------------------------------------
+   run the regular toplevel compute method from plain PPPMCG
    which will have individual methods replaced by our threaded
    versions and then call the obligatory force reduction.
 ------------------------------------------------------------------------- */
@@ -331,94 +324,10 @@ void PPPMCGOMP::compute(int eflag, int vflag)
 }

 /* ----------------------------------------------------------------------
-   create discretized "density" on section of global grid due to my particles
-   density(x,y,z) = charge "density" at grid points of my 3d brick
-   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
-   in global grid
+   interpolate from grid to get electric field & force on my particles for ik
 ------------------------------------------------------------------------- */

-void PPPMCGOMP::make_rho()
-{
-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
-  const int nthreads = comm->nthreads;
-
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = num_charged;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int tid = 0;
-    const int ifrom = 0;
-    const int ito = num_charged;
-#endif
-
-    int i,j,l,m,n,nx,ny,nz,mx,my,mz;
-    FFT_SCALAR dx,dy,dz,x0,y0,z0;
-
-    // set up clear 3d density array
-    const int nzoffs = (nzhi_out-nzlo_out+1)*tid;
-    FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]);
-    memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR));
-
-    ThrData *thr = fix->get_thr(tid);
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
-
-    // loop over my charges, add their contribution to nearby grid points
-    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-    // (dx,dy,dz) = distance to "lower left" grid pt
-    // (mx,my,mz) = global coords of moving stencil pt
-
-    // this if protects against having more threads than charged local atoms
-    if (ifrom < num_charged) {
-      for (j = ifrom; j < ito; j++) {
-        i = is_charged[j];
-
-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-        compute_rho1d_thr(r1d,dx,dy,dz);
-
-        z0 = delvolinv * q[i];
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          y0 = z0*r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            x0 = y0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              db[mz][my][mx] += x0*r1d[0][l];
-            }
-          }
-        }
-      }
-    }
-
-#if defined(_OPENMP)
-    // reduce 3d density array
-    if (nthreads > 1) {
-      data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
-    }
-#endif
-  }
-}
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
-------------------------------------------------------------------------- */
-
-void PPPMCGOMP::fieldforce()
+void PPPMCGOMP::fieldforce_ik()
 {
   // loop over my charges, interpolate electric field from nearby grid points
   // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
@@ -426,79 +335,161 @@ void PPPMCGOMP::fieldforce()
   // (dx,dy,dz) = distance to "lower left" grid pt
   // (mx,my,mz) = global coords of moving stencil pt
   // ek = 3 components of E-field on particle

-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
-  const int nthreads = comm->nthreads;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;
   const double qqrd2e = force->qqrd2e;
+  const int nthreads = comm->nthreads;

 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = num_charged;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int ifrom = 0;
-    const int ito = num_charged;
-    const int tid = 0;
-#endif
+    FFT_SCALAR dx,dy,dz,x0,y0,z0,ekx,eky,ekz;
+    int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz;
+
+    loop_setup_thr(ifrom,ito,tid,num_charged,nthreads);

+    // get per thread data
     ThrData *thr = fix->get_thr(tid);
-    double * const * const f = thr->get_f();
+    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
     FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

-    int i,j,l,m,n,nx,ny,nz,mx,my,mz;
-    FFT_SCALAR dx,dy,dz,x0,y0,z0;
-    FFT_SCALAR ekx,eky,ekz;
-
-    // this if protects against having more threads than charged local atoms
-    if (ifrom < num_charged) {
-      for (j = ifrom; j < ito; j++) {
-        i = is_charged[j];
-
-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-        compute_rho1d_thr(r1d,dx,dy,dz);
-
-        ekx = eky = ekz = ZEROF;
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          z0 = r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            y0 = z0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              x0 = y0*r1d[0][l];
-              ekx -= x0*vdx_brick[mz][my][mx];
-              eky -= x0*vdy_brick[mz][my][mx];
-              ekz -= x0*vdz_brick[mz][my][mx];
-            }
-          }
-        }
-
-        // convert E-field to force
-        const double qfactor = qqrd2e*scale*q[i];
-        f[i][0] += qfactor*ekx;
-        f[i][1] += qfactor*eky;
-        f[i][2] += qfactor*ekz;
-      }
-    }
-  }
+    for (int j = ifrom; j < ito; ++j) {
+      i = is_charged[j];
+
+      nx = part2grid[i][0];
+      ny = part2grid[i][1];
+      nz = part2grid[i][2];
+      dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv;
+      dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv;
+      dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv;
+
+      compute_rho1d_thr(r1d,dx,dy,dz);
+
+      ekx = eky = ekz = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        z0 = r1d[2][n];
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          y0 = z0*r1d[1][m];
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            x0 = y0*r1d[0][l];
+            ekx -= x0*vdx_brick[mz][my][mx];
+            eky -= x0*vdy_brick[mz][my][mx];
+            ekz -= x0*vdz_brick[mz][my][mx];
+          }
+        }
+      }
+
+      // convert E-field to force
+      const double qfactor = qqrd2e * scale * q[i];
+      f[i].x += qfactor*ekx;
+      f[i].y += qfactor*eky;
+      if (slabflag != 2) f[i].z += qfactor*ekz;
+    }
+  } // end of parallel region
 }
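
The (dbl3_t *) casts introduced above reinterpret LAMMPS' contiguous nlocal-by-3 coordinate and force storage as arrays of xyz structs, so the hot loops can write x[i].x with _noalias hints instead of x[i][0]. The real dbl3_t lives in the USER-OMP headers; this standalone version only illustrates the layout trick:

    #include <cstdio>

    typedef struct { double x,y,z; } dbl3_t;   // illustrative stand-in

    int main() {
      // stand-in for atom->x: row-major nlocal x 3 storage
      double xraw[2][3] = {{1,2,3},{4,5,6}};
      // mirror of "(dbl3_t *) atom->x[0]": relies on the struct having
      // exactly the same contiguous double x,y,z layout as each row
      const dbl3_t *x = (const dbl3_t *) &xraw[0][0];
      printf("%g %g %g\n", x[1].x, x[1].y, x[1].z);   // prints 4 5 6
      return 0;
    }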
 /* ----------------------------------------------------------------------
-   interpolate from grid to get per-atom energy/virial
+   interpolate from grid to get electric field & force on my particles for ad
 ------------------------------------------------------------------------- */

+void PPPMCGOMP::fieldforce_ad()
+{
+  const double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda;
+  const double hx_inv = nx_pppm/prd[0];
+  const double hy_inv = ny_pppm/prd[1];
+  const double hz_inv = nz_pppm/prd[2];
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;
+  const double qqrd2e = force->qqrd2e;
+  const int nthreads = comm->nthreads;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz;
+    FFT_SCALAR dx,dy,dz,ekx,eky,ekz;
+    double s1,s2,s3,sf;
+
+    loop_setup_thr(ifrom,ito,tid,num_charged,nthreads);
+
+    // get per thread data
+    ThrData *thr = fix->get_thr(tid);
+    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+    FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());
+
+    for (int j = ifrom; j < ito; ++j) {
+      i = is_charged[j];
+
+      nx = part2grid[i][0];
+      ny = part2grid[i][1];
+      nz = part2grid[i][2];
+      dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv;
+      dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv;
+      dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv;
+
+      compute_rho1d_thr(r1d,dx,dy,dz);
+      compute_drho1d_thr(d1d,dx,dy,dz);
+
+      ekx = eky = ekz = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+            eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+            ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
+          }
+        }
+      }
+      ekx *= hx_inv;
+      eky *= hy_inv;
+      ekz *= hz_inv;
+
+      // convert E-field to force and subtract self forces
+      const double qi = q[i];
+      const double qfactor = qqrd2e * scale * qi;
+
+      s1 = x[i].x*hx_inv;
+      sf = sf_coeff[0]*sin(MY_2PI*s1);
+      sf += sf_coeff[1]*sin(MY_4PI*s1);
+      sf *= 2.0*qi;
+      f[i].x += qfactor*(ekx - sf);
+
+      s2 = x[i].y*hy_inv;
+      sf = sf_coeff[2]*sin(MY_2PI*s2);
+      sf += sf_coeff[3]*sin(MY_4PI*s2);
+      sf *= 2*qi;
+      f[i].y += qfactor*(eky - sf);
+
+      s3 = x[i].z*hz_inv;
+      sf = sf_coeff[4]*sin(MY_2PI*s3);
+      sf += sf_coeff[5]*sin(MY_4PI*s3);
+      sf *= 2*qi;
+      if (slabflag != 2) f[i].z += qfactor*(ekz - sf);
+    }
+  } // end of parallel region
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */
 void PPPMCGOMP::fieldforce_peratom()
 {
@@ -507,82 +498,71 @@ void PPPMCGOMP::fieldforce_peratom()
   // (dx,dy,dz) = distance to "lower left" grid pt
   // (mx,my,mz) = global coords of moving stencil pt

-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;
   const int nthreads = comm->nthreads;

 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = num_charged;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int ifrom = 0;
-    const int ito = num_charged;
-    const int tid = 0;
-#endif
-
-    ThrData *thr = fix->get_thr(tid);
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
-
-    int i,j,l,m,n,nx,ny,nz,mx,my,mz;
     FFT_SCALAR dx,dy,dz,x0,y0,z0;
     FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+    int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz;

-    // this if protects against having more threads than charged local atoms
-    if (ifrom < num_charged) {
-      for (j = ifrom; j < ito; j++) {
-        i = is_charged[j];
-
-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-        compute_rho1d_thr(r1d,dx,dy,dz);
-
-        u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          z0 = r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            y0 = z0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              x0 = y0*r1d[0][l];
-              if (eflag_atom) u += x0*u_brick[mz][my][mx];
-              if (vflag_atom) {
-                v0 += x0*v0_brick[mz][my][mx];
-                v1 += x0*v1_brick[mz][my][mx];
-                v2 += x0*v2_brick[mz][my][mx];
-                v3 += x0*v3_brick[mz][my][mx];
-                v4 += x0*v4_brick[mz][my][mx];
-                v5 += x0*v5_brick[mz][my][mx];
-              }
-            }
-          }
-        }
-
-        if (eflag_atom) eatom[i] += q[i]*u;
-        if (vflag_atom) {
-          vatom[i][0] += v0;
-          vatom[i][1] += v1;
-          vatom[i][2] += v2;
-          vatom[i][3] += v3;
-          vatom[i][4] += v4;
-          vatom[i][5] += v5;
-        }
-      }
-    }
-  }
+    loop_setup_thr(ifrom,ito,tid,num_charged,nthreads);
+
+    // get per thread data
+    ThrData *thr = fix->get_thr(tid);
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+
+    for (int j=ifrom; j < ito; ++j) {
+      i = is_charged[j];
+
+      nx = part2grid[i][0];
+      ny = part2grid[i][1];
+      nz = part2grid[i][2];
+      dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv;
+      dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv;
+      dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv;
+
+      compute_rho1d_thr(r1d,dx,dy,dz);
+
+      u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        z0 = r1d[2][n];
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          y0 = z0*r1d[1][m];
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            x0 = y0*r1d[0][l];
+            if (eflag_atom) u += x0*u_brick[mz][my][mx];
+            if (vflag_atom) {
+              v0 += x0*v0_brick[mz][my][mx];
+              v1 += x0*v1_brick[mz][my][mx];
+              v2 += x0*v2_brick[mz][my][mx];
+              v3 += x0*v3_brick[mz][my][mx];
+              v4 += x0*v4_brick[mz][my][mx];
+              v5 += x0*v5_brick[mz][my][mx];
+            }
+          }
+        }
+      }
+
+      const double qi = q[i];
+      if (eflag_atom) eatom[i] += qi*u;
+      if (vflag_atom) {
+        vatom[i][0] += qi*v0;
+        vatom[i][1] += qi*v1;
+        vatom[i][2] += qi*v2;
+        vatom[i][3] += qi*v3;
+        vatom[i][4] += qi*v4;
+        vatom[i][5] += qi*v5;
+      }
+    }
+  } // end of parallel region
 }
 /* ----------------------------------------------------------------------
@@ -590,7 +570,7 @@ void PPPMCGOMP::fieldforce_peratom()
    dx,dy,dz = distance of particle from "lower left" grid point
 ------------------------------------------------------------------------- */

 void PPPMCGOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx,
                                   const FFT_SCALAR &dy, const FFT_SCALAR &dz)
 {
   int k,l;
   FFT_SCALAR r1,r2,r3;
@@ -608,3 +588,28 @@ void PPPMCGOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCAL
     r1d[2][k] = r3;
   }
 }
+
+/* ----------------------------------------------------------------------
+   charge assignment into drho1d
+   dx,dy,dz = distance of particle from "lower left" grid point
+------------------------------------------------------------------------- */
+
+void PPPMCGOMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx,
+                                   const FFT_SCALAR &dy, const FFT_SCALAR &dz)
+{
+  int k,l;
+  FFT_SCALAR r1,r2,r3;
+
+  for (k = (1-order)/2; k <= order/2; k++) {
+    r1 = r2 = r3 = ZEROF;
+
+    for (l = order-2; l >= 0; l--) {
+      r1 = drho_coeff[l][k] + r1*dx;
+      r2 = drho_coeff[l][k] + r2*dy;
+      r3 = drho_coeff[l][k] + r3*dz;
+    }
+    d1d[0][k] = r1;
+    d1d[1][k] = r2;
+    d1d[2][k] = r3;
+  }
+}
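
Both compute_rho1d_thr() and the new compute_drho1d_thr() evaluate the per-dimension charge-assignment polynomials by Horner's rule. For stencil offset k and assignment order p,

    W_k(d)  = sum over l = 0..p-1 of  rho_coeff[l][k]  * d^l
    W'_k(d) = sum over l = 0..p-2 of  drho_coeff[l][k] * d^l

and the inner loops run r = coeff[l][k] + r*d from the highest power downward, which is why the derivative loop above starts at l = order-2 while the value loop starts one power higher.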

View File: pppm_cg_omp.h

@@ -25,23 +25,29 @@ KSpaceStyle(pppm/cg/omp,PPPMCGOMP)
 namespace LAMMPS_NS {

 class PPPMCGOMP : public PPPMCG, public ThrOMP {
  public:
   PPPMCGOMP(class LAMMPS *, int, char **);
-  virtual void compute(int, int);
-  virtual void setup();
   virtual ~PPPMCGOMP () {};
+  virtual void compute(int, int);

  protected:
   virtual void allocate();
   virtual void deallocate();
-  virtual void fieldforce();
-  virtual void fieldforce_peratom();
-  virtual void make_rho();
+  virtual void compute_gf_ik();
+  virtual void compute_gf_ad();
+
+  virtual void fieldforce_ik();
+  virtual void fieldforce_ad();
+  virtual void fieldforce_peratom();
+  // virtual void make_rho();
+
+ private:
   void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
                          const FFT_SCALAR &, const FFT_SCALAR &);
-  // void compute_rho_coeff();
+  void compute_drho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
+                          const FFT_SCALAR &, const FFT_SCALAR &);
   // void slabcorr(int);
 };

View File: pppm_omp.cpp

@ -19,9 +19,12 @@
#include "atom.h" #include "atom.h"
#include "comm.h" #include "comm.h"
#include "domain.h" #include "domain.h"
#include "error.h"
#include "fix_omp.h"
#include "force.h" #include "force.h"
#include "memory.h" #include "memory.h"
#include "math_const.h" #include "math_const.h"
#include "math_special.h"
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
@ -29,13 +32,12 @@
#include "suffix.h" #include "suffix.h"
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
using namespace MathConst; using namespace MathConst;
using namespace MathSpecial;
#ifdef FFT_SINGLE #ifdef FFT_SINGLE
#define ZEROF 0.0f #define ZEROF 0.0f
#define ONEF 1.0f
#else #else
#define ZEROF 0.0 #define ZEROF 0.0
#define ONEF 1.0
#endif #endif
#define EPS_HOC 1.0e-7 #define EPS_HOC 1.0e-7
@ -56,7 +58,27 @@ void PPPMOMP::allocate()
{ {
PPPM::allocate(); PPPM::allocate();
const int nthreads = comm->nthreads; #if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
{
#if defined(_OPENMP)
const int tid = omp_get_thread_num();
#else
const int tid = 0;
#endif
ThrData *thr = fix->get_thr(tid);
thr->init_pppm(order,memory);
}
}
/* ----------------------------------------------------------------------
free memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */
void PPPMOMP::deallocate()
{
PPPM::deallocate();
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) #pragma omp parallel default(none)
@ -67,153 +89,26 @@ void PPPMOMP::allocate()
#else #else
const int tid = 0; const int tid = 0;
#endif #endif
FFT_SCALAR **rho1d_thr;
memory->create2d_offset(rho1d_thr,3,-order/2,order/2,"pppm:rho1d_thr");
ThrData *thr = fix->get_thr(tid); ThrData *thr = fix->get_thr(tid);
thr->init_pppm(static_cast<void *>(rho1d_thr)); thr->init_pppm(-order,memory);
}
const int nzend = (nzhi_out-nzlo_out+1)*nthreads + nzlo_out -1;
// reallocate density brick, so it fits our needs
memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
memory->create3d_offset(density_brick,nzlo_out,nzend,nylo_out,nyhi_out,
nxlo_out,nxhi_out,"pppm:density_brick");
}
// NOTE: special version of reduce_data for FFT_SCALAR data type.
// reduce per thread data into the first part of the data
// array that is used for the non-threaded parts and reset
// the temporary storage to 0.0. this routine depends on
// multi-dimensional arrays like force stored in this order
// x1,y1,z1,x2,y2,z2,...
// we need to post a barrier to wait until all threads are done
// writing to the array.
static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid)
{
#if defined(_OPENMP)
// NOOP in non-threaded execution.
if (nthreads == 1) return;
#pragma omp barrier
{
const int nvals = ndim*nall;
const int idelta = nvals/nthreads + 1;
const int ifrom = tid*idelta;
const int ito = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);
// this if protects against having more threads than atoms
if (ifrom < nall) {
for (int m = ifrom; m < ito; ++m) {
for (int n = 1; n < nthreads; ++n) {
dall[m] += dall[n*nvals + m];
dall[n*nvals + m] = 0.0;
}
}
}
}
#else
// NOOP in non-threaded execution.
return;
#endif
}
/* ----------------------------------------------------------------------
free memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */
void PPPMOMP::deallocate()
{
PPPM::deallocate();
for (int i=0; i < comm->nthreads; ++i) {
ThrData * thr = fix->get_thr(i);
FFT_SCALAR ** rho1d_thr = static_cast<FFT_SCALAR **>(thr->get_rho1d());
memory->destroy2d_offset(rho1d_thr,-order/2);
} }
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
adjust PPPM coeffs, called initially and whenever volume has changed pre-compute modified (Hockney-Eastwood) Coulomb Green's function
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
void PPPMOMP::setup() void PPPMOMP::compute_gf_ik()
{ {
int i,j,k,n; const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
double *prd;
// volume-dependent factors
// adjust z dimension for 2d slab PPPM
// z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
if (triclinic == 0) prd = domain->prd;
else prd = domain->prd_lamda;
const double xprd = prd[0]; const double xprd = prd[0];
const double yprd = prd[1]; const double yprd = prd[1];
const double zprd = prd[2]; const double zprd = prd[2];
const double zprd_slab = zprd*slab_volfactor; const double zprd_slab = zprd*slab_volfactor;
volume = xprd * yprd * zprd_slab; const double unitkx = (MY_2PI/xprd);
const double unitky = (MY_2PI/yprd);
delxinv = nx_pppm/xprd; const double unitkz = (MY_2PI/zprd_slab);
delyinv = ny_pppm/yprd;
delzinv = nz_pppm/zprd_slab;
delvolinv = delxinv*delyinv*delzinv;
const double unitkx = (2.0*MY_PI/xprd);
const double unitky = (2.0*MY_PI/yprd);
const double unitkz = (2.0*MY_PI/zprd_slab);
// fkx,fky,fkz for my FFT grid pts
double per;
for (i = nxlo_fft; i <= nxhi_fft; i++) {
per = i - nx_pppm*(2*i/nx_pppm);
fkx[i] = unitkx*per;
}
for (i = nylo_fft; i <= nyhi_fft; i++) {
per = i - ny_pppm*(2*i/ny_pppm);
fky[i] = unitky*per;
}
for (i = nzlo_fft; i <= nzhi_fft; i++) {
per = i - nz_pppm*(2*i/nz_pppm);
fkz[i] = unitkz*per;
}
// virial coefficients
double sqk,vterm;
n = 0;
for (k = nzlo_fft; k <= nzhi_fft; k++) {
for (j = nylo_fft; j <= nyhi_fft; j++) {
for (i = nxlo_fft; i <= nxhi_fft; i++) {
sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k];
if (sqk == 0.0) {
vg[n][0] = 0.0;
vg[n][1] = 0.0;
vg[n][2] = 0.0;
vg[n][3] = 0.0;
vg[n][4] = 0.0;
vg[n][5] = 0.0;
} else {
vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i];
vg[n][1] = 1.0 + vterm*fky[j]*fky[j];
vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k];
vg[n][3] = vterm*fkx[i]*fky[j];
vg[n][4] = vterm*fkx[i]*fkz[k];
vg[n][5] = vterm*fky[j]*fkz[k];
}
n++;
}
}
}
// modified (Hockney-Eastwood) Coulomb Green's function
const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) * const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
pow(-log(EPS_HOC),0.25)); pow(-log(EPS_HOC),0.25));
@@ -221,93 +116,188 @@ void PPPMOMP::setup()
pow(-log(EPS_HOC),0.25)); pow(-log(EPS_HOC),0.25));
const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
pow(-log(EPS_HOC),0.25)); pow(-log(EPS_HOC),0.25));
const int numk = nxhi_fft - nxlo_fft + 1;
const int numl = nyhi_fft - nylo_fft + 1;
const double form = 1.0; const int twoorder = 2*order;
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) #pragma omp parallel default(none)
#endif #endif
{ {
int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m; double snx,sny,snz;
double snx,sny,snz,sqk;
double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
double sum1,dot1,dot2; double sum1,dot1,dot2;
double numerator,denominator; double numerator,denominator;
const double gew2 = -4.0*g_ewald*g_ewald; double sqk;
const int nnx = nxhi_fft-nxlo_fft+1; int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid;
const int nny = nyhi_fft-nylo_fft+1;
loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads); loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
for (m = nzlo_fft; m <= nzhi_fft; m++) { for (n = nfrom; n < nto; ++n) {
m = n / (numl*numk);
l = (n - m*numl*numk) / numk;
k = n - m*numl*numk - l*numk;
m += nzlo_fft;
l += nylo_fft;
k += nxlo_fft;
const double fkzm = fkz[m]; mper = m - nz_pppm*(2*m/nz_pppm);
snz = sin(0.5*fkzm*zprd_slab/nz_pppm); snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
snz *= snz;
for (l = nylo_fft; l <= nyhi_fft; l++) { lper = l - ny_pppm*(2*l/ny_pppm);
const double fkyl = fky[l]; sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
sny = sin(0.5*fkyl*yprd/ny_pppm);
sny *= sny;
for (k = nxlo_fft; k <= nxhi_fft; k++) { kper = k - nx_pppm*(2*k/nx_pppm);
snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
/* only compute the part designated to this thread */ sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft));
if ((nn < nnfrom) || (nn >=nnto)) continue;
const double fkxk = fkx[k]; if (sqk != 0.0) {
snx = sin(0.5*fkxk*xprd/nx_pppm); numerator = 12.5663706/sqk;
snx *= snx; denominator = gf_denom(snx,sny,snz);
sum1 = 0.0;
sqk = fkxk*fkxk + fkyl*fkyl + fkzm*fkzm; for (nx = -nbx; nx <= nbx; nx++) {
qx = unitkx*(kper+nx_pppm*nx);
sx = exp(-0.25*square(qx/g_ewald));
argx = 0.5*qx*xprd/nx_pppm;
wx = powsinxx(argx,twoorder);
if (sqk != 0.0) { for (ny = -nby; ny <= nby; ny++) {
numerator = form*MY_4PI/sqk; qy = unitky*(lper+ny_pppm*ny);
denominator = gf_denom(snx,sny,snz); sy = exp(-0.25*square(qy/g_ewald));
sum1 = 0.0; argy = 0.5*qy*yprd/ny_pppm;
const double dorder = static_cast<double>(order); wy = powsinxx(argy,twoorder);
for (nx = -nbx; nx <= nbx; nx++) {
qx = fkxk + unitkx*nx_pppm*nx;
sx = exp(qx*qx/gew2);
wx = 1.0;
argx = 0.5*qx*xprd/nx_pppm;
if (argx != 0.0) wx = pow(sin(argx)/argx,dorder);
wx *=wx;
for (ny = -nby; ny <= nby; ny++) { for (nz = -nbz; nz <= nbz; nz++) {
qy = fkyl + unitky*ny_pppm*ny; qz = unitkz*(mper+nz_pppm*nz);
sy = exp(qy*qy/gew2); sz = exp(-0.25*square(qz/g_ewald));
wy = 1.0; argz = 0.5*qz*zprd_slab/nz_pppm;
argy = 0.5*qy*yprd/ny_pppm; wz = powsinxx(argz,twoorder);
if (argy != 0.0) wy = pow(sin(argy)/argy,dorder);
wy *= wy;
for (nz = -nbz; nz <= nbz; nz++) { dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
qz = fkzm + unitkz*nz_pppm*nz; dot2 = qx*qx+qy*qy+qz*qz;
sz = exp(qz*qz/gew2); sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
wz = 1.0;
argz = 0.5*qz*zprd_slab/nz_pppm;
if (argz != 0.0) wz = pow(sin(argz)/argz,dorder);
wz *= wz;
dot1 = fkxk*qx + fkyl*qy + fkzm*qz;
dot2 = qx*qx+qy*qy+qz*qz;
sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
}
}
} }
greensfn[nn] = numerator*sum1/denominator; }
} else greensfn[nn] = 0.0;
} }
} greensfn[n] = numerator*sum1/denominator;
} else greensfn[n] = 0.0;
} }
} } // end of parallel region
} }
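compute_gf_ik() now partitions the flat FFT index n across threads via loop_setup_thr() and recovers the grid indices (k,l,m) with integer division. A self-contained check of that decomposition; the grid bounds below are made up for the example:

#include <assert.h>

int main() {
  // hypothetical FFT sub-grid bounds, for illustration only
  const int nxlo = 2, nxhi = 9, nylo = 1, nyhi = 6, nzlo = 0, nzhi = 4;
  const int numk = nxhi - nxlo + 1;
  const int numl = nyhi - nylo + 1;
  int n = 0;
  for (int m = nzlo; m <= nzhi; m++)
    for (int l = nylo; l <= nyhi; l++)
      for (int k = nxlo; k <= nxhi; k++, n++) {
        // same arithmetic as in compute_gf_ik()/compute_gf_ad()
        const int mm = n / (numl*numk) + nzlo;
        const int ll = (n - (mm-nzlo)*numl*numk) / numk + nylo;
        const int kk = n - (mm-nzlo)*numl*numk - (ll-nylo)*numk + nxlo;
        assert(mm == m && ll == l && kk == k);
      }
  return 0;
}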
/* ----------------------------------------------------------------------
   compute optimized Green's function for energy calculation
------------------------------------------------------------------------- */
void PPPMOMP::compute_gf_ad()
{
const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
const double xprd = prd[0];
const double yprd = prd[1];
const double zprd = prd[2];
const double zprd_slab = zprd*slab_volfactor;
const double unitkx = (MY_2PI/xprd);
const double unitky = (MY_2PI/yprd);
const double unitkz = (MY_2PI/zprd_slab);
const int numk = nxhi_fft - nxlo_fft + 1;
const int numl = nyhi_fft - nylo_fft + 1;
const int twoorder = 2*order;
double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;
#if defined(_OPENMP)
#pragma omp parallel default(none) reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
#endif
{
double snx,sny,snz,sqk;
double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
double numerator,denominator;
int k,l,m,kper,lper,mper,n,nfrom,nto,tid;
loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
for (n = nfrom; n < nto; ++n) {
m = n / (numl*numk);
l = (n - m*numl*numk) / numk;
k = n - m*numl*numk - l*numk;
m += nzlo_fft;
l += nylo_fft;
k += nxlo_fft;
mper = m - nz_pppm*(2*m/nz_pppm);
qz = unitkz*mper;
snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
sz = exp(-0.25*square(qz/g_ewald));
argz = 0.5*qz*zprd_slab/nz_pppm;
wz = powsinxx(argz,twoorder);
lper = l - ny_pppm*(2*l/ny_pppm);
qy = unitky*lper;
sny = square(sin(0.5*qy*yprd/ny_pppm));
sy = exp(-0.25*square(qy/g_ewald));
argy = 0.5*qy*yprd/ny_pppm;
wy = powsinxx(argy,twoorder);
kper = k - nx_pppm*(2*k/nx_pppm);
qx = unitkx*kper;
snx = square(sin(0.5*qx*xprd/nx_pppm));
sx = exp(-0.25*square(qx/g_ewald));
argx = 0.5*qx*xprd/nx_pppm;
wx = powsinxx(argx,twoorder);
sqk = qx*qx + qy*qy + qz*qz;
if (sqk != 0.0) {
numerator = MY_4PI/sqk;
denominator = gf_denom(snx,sny,snz);
greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
sf0 += sf_precoeff1[n]*greensfn[n];
sf1 += sf_precoeff2[n]*greensfn[n];
sf2 += sf_precoeff3[n]*greensfn[n];
sf3 += sf_precoeff4[n]*greensfn[n];
sf4 += sf_precoeff5[n]*greensfn[n];
sf5 += sf_precoeff6[n]*greensfn[n];
} else {
greensfn[n] = 0.0;
sf0 += sf_precoeff1[n]*greensfn[n];
sf1 += sf_precoeff2[n]*greensfn[n];
sf2 += sf_precoeff3[n]*greensfn[n];
sf3 += sf_precoeff4[n]*greensfn[n];
sf4 += sf_precoeff5[n]*greensfn[n];
sf5 += sf_precoeff6[n]*greensfn[n];
}
}
} // end of parallel region
// compute the coefficients for the self-force correction
double prex, prey, prez, tmp[6];
prex = prey = prez = MY_PI/volume;
prex *= nx_pppm/xprd;
prey *= ny_pppm/yprd;
prez *= nz_pppm/zprd_slab;
tmp[0] = sf0 * prex;
tmp[1] = sf1 * prex*2;
tmp[2] = sf2 * prey;
tmp[3] = sf3 * prey*2;
tmp[4] = sf4 * prez;
tmp[5] = sf5 * prez*2;
// communicate values with other procs
MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
}
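The six sf accumulators are summed across threads by the OpenMP reduction clause before MPI_Allreduce combines them across ranks. A toy version of the thread-level part (no MPI), compilable with or without -fopenmp:

#include <stdio.h>

int main() {
  double sf0 = 0.0, sf1 = 0.0;
#if defined(_OPENMP)
#pragma omp parallel reduction(+:sf0,sf1)
#endif
  {
    // each thread accumulates into a private copy; OpenMP adds them up
    sf0 += 1.0;
    sf1 += 2.0;
  }
  printf("sf0=%g sf1=%g\n", sf0, sf1);  // scales with the thread count
  return 0;
}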
/* ----------------------------------------------------------------------
   run the regular toplevel compute method from plain PPPM
   which will have individual methods replaced by our threaded
   versions and then call the obligatory force reduction.
------------------------------------------------------------------------- */
@@ -332,94 +322,10 @@ void PPPMOMP::compute(int eflag, int vflag)
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
create discretized "density" on section of global grid due to my particles interpolate from grid to get electric field & force on my particles for ik
density(x,y,z) = charge "density" at grid points of my 3d brick
(nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
in global grid
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
void PPPMOMP::make_rho() void PPPMOMP::fieldforce_ik()
{
const double * const q = atom->q;
const double * const * const x = atom->x;
const int nthreads = comm->nthreads;
const int nlocal = atom->nlocal;
#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
{
#if defined(_OPENMP)
// each thread works on a fixed chunk of atoms.
const int tid = omp_get_thread_num();
const int inum = nlocal;
const int idelta = 1 + inum/nthreads;
const int ifrom = tid*idelta;
const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
#else
const int tid = 0;
const int ifrom = 0;
const int ito = nlocal;
#endif
int l,m,n,nx,ny,nz,mx,my,mz;
FFT_SCALAR dx,dy,dz,x0,y0,z0;
// set up clear 3d density array
const int nzoffs = (nzhi_out-nzlo_out+1)*tid;
FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]);
memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR));
ThrData *thr = fix->get_thr(tid);
FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
// loop over my charges, add their contribution to nearby grid points
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
// (dx,dy,dz) = distance to "lower left" grid pt
// (mx,my,mz) = global coords of moving stencil pt
// this if protects against having more threads than local atoms
if (ifrom < nlocal) {
for (int i = ifrom; i < ito; i++) {
nx = part2grid[i][0];
ny = part2grid[i][1];
nz = part2grid[i][2];
dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
compute_rho1d_thr(r1d,dx,dy,dz);
z0 = delvolinv * q[i];
for (n = nlower; n <= nupper; n++) {
mz = n+nz;
y0 = z0*r1d[2][n];
for (m = nlower; m <= nupper; m++) {
my = m+ny;
x0 = y0*r1d[1][m];
for (l = nlower; l <= nupper; l++) {
mx = l+nx;
db[mz][my][mx] += x0*r1d[0][l];
}
}
}
}
}
#if defined(_OPENMP)
// reduce 3d density array
if (nthreads > 1) {
data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
}
#endif
}
}
/* ----------------------------------------------------------------------
interpolate from grid to get electric field & force on my particles
------------------------------------------------------------------------- */
void PPPMOMP::fieldforce()
{ {
// loop over my charges, interpolate electric field from nearby grid points // loop over my charges, interpolate electric field from nearby grid points
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
@@ -427,79 +333,170 @@ void PPPMOMP::fieldforce()
// (mx,my,mz) = global coords of moving stencil pt // (mx,my,mz) = global coords of moving stencil pt
// ek = 3 components of E-field on particle // ek = 3 components of E-field on particle
const double * const q = atom->q; const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
const double * const * const x = atom->x; const double * _noalias const q = atom->q;
const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
const double qqrd2e = force->qqrd2e;
const double boxlox = boxlo[0];
const double boxloy = boxlo[1];
const double boxloz = boxlo[2];
const int nthreads = comm->nthreads; const int nthreads = comm->nthreads;
const int nlocal = atom->nlocal; const int nlocal = atom->nlocal;
const double qqrd2e = force->qqrd2e;
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) #pragma omp parallel default(none)
#endif #endif
{ {
#if defined(_OPENMP) FFT_SCALAR x0,y0,z0,ekx,eky,ekz;
// each thread works on a fixed chunk of atoms. int i,ifrom,ito,tid,l,m,n,mx,my,mz;
const int tid = omp_get_thread_num();
const int inum = nlocal; loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
const int idelta = 1 + inum/nthreads;
const int ifrom = tid*idelta; // get per thread data
const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
#else
const int ifrom = 0;
const int ito = nlocal;
const int tid = 0;
#endif
ThrData *thr = fix->get_thr(tid); ThrData *thr = fix->get_thr(tid);
double * const * const f = thr->get_f(); dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d()); FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
int l,m,n,nx,ny,nz,mx,my,mz; for (i = ifrom; i < ito; ++i) {
FFT_SCALAR dx,dy,dz,x0,y0,z0; const int nx = p2g[i].a;
FFT_SCALAR ekx,eky,ekz; const int ny = p2g[i].b;
const int nz = p2g[i].t;
const FFT_SCALAR dx = nx+shiftone - (x[i].x-boxlox)*delxinv;
const FFT_SCALAR dy = ny+shiftone - (x[i].y-boxloy)*delyinv;
const FFT_SCALAR dz = nz+shiftone - (x[i].z-boxloz)*delzinv;
// this if protects against having more threads than local atoms compute_rho1d_thr(r1d,dx,dy,dz);
if (ifrom < nlocal) {
for (int i = ifrom; i < ito; i++) {
nx = part2grid[i][0]; ekx = eky = ekz = ZEROF;
ny = part2grid[i][1]; for (n = nlower; n <= nupper; n++) {
nz = part2grid[i][2]; mz = n+nz;
dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; z0 = r1d[2][n];
dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; for (m = nlower; m <= nupper; m++) {
dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; my = m+ny;
y0 = z0*r1d[1][m];
compute_rho1d_thr(r1d,dx,dy,dz); for (l = nlower; l <= nupper; l++) {
mx = l+nx;
ekx = eky = ekz = ZEROF; x0 = y0*r1d[0][l];
for (n = nlower; n <= nupper; n++) { ekx -= x0*vdx_brick[mz][my][mx];
mz = n+nz; eky -= x0*vdy_brick[mz][my][mx];
z0 = r1d[2][n]; ekz -= x0*vdz_brick[mz][my][mx];
for (m = nlower; m <= nupper; m++) {
my = m+ny;
y0 = z0*r1d[1][m];
for (l = nlower; l <= nupper; l++) {
mx = l+nx;
x0 = y0*r1d[0][l];
ekx -= x0*vdx_brick[mz][my][mx];
eky -= x0*vdy_brick[mz][my][mx];
ekz -= x0*vdz_brick[mz][my][mx];
}
} }
} }
// convert E-field to force
const double qfactor = qqrd2e*scale*q[i];
f[i][0] += qfactor*ekx;
f[i][1] += qfactor*eky;
f[i][2] += qfactor*ekz;
} }
// convert E-field to force
const double qfactor = qqrd2e * scale * q[i];
f[i].x += qfactor*ekx;
f[i].y += qfactor*eky;
if (slabflag != 2) f[i].z += qfactor*ekz;
} }
} } // end of parallel region
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
interpolate from grid to get per-atom energy/virial interpolate from grid to get electric field & force on my particles for ad
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
void PPPMOMP::fieldforce_ad()
{
const double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda;
const double hx_inv = nx_pppm/prd[0];
const double hy_inv = ny_pppm/prd[1];
const double hz_inv = nz_pppm/prd[2];
// loop over my charges, interpolate electric field from nearby grid points
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
// (dx,dy,dz) = distance to "lower left" grid pt
// (mx,my,mz) = global coords of moving stencil pt
// ek = 3 components of E-field on particle
const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
const double * _noalias const q = atom->q;
const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
const double qqrd2e = force->qqrd2e;
const double boxlox = boxlo[0];
const double boxloy = boxlo[1];
const double boxloz = boxlo[2];
const int nthreads = comm->nthreads;
const int nlocal = atom->nlocal;
#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
{
double s1,s2,s3,sf;
FFT_SCALAR ekx,eky,ekz;
int i,ifrom,ito,tid,l,m,n,mx,my,mz;
loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
// get per thread data
ThrData *thr = fix->get_thr(tid);
dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());
for (i = ifrom; i < ito; ++i) {
const int nx = p2g[i].a;
const int ny = p2g[i].b;
const int nz = p2g[i].t;
const FFT_SCALAR dx = nx+shiftone - (x[i].x-boxlox)*delxinv;
const FFT_SCALAR dy = ny+shiftone - (x[i].y-boxloy)*delyinv;
const FFT_SCALAR dz = nz+shiftone - (x[i].z-boxloz)*delzinv;
compute_rho1d_thr(r1d,dx,dy,dz);
compute_drho1d_thr(d1d,dx,dy,dz);
ekx = eky = ekz = ZEROF;
for (n = nlower; n <= nupper; n++) {
mz = n+nz;
for (m = nlower; m <= nupper; m++) {
my = m+ny;
for (l = nlower; l <= nupper; l++) {
mx = l+nx;
ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
}
}
}
ekx *= hx_inv;
eky *= hy_inv;
ekz *= hz_inv;
// convert E-field to force and subtract self forces
const double qi = q[i];
const double qfactor = qqrd2e * scale * qi;
s1 = x[i].x*hx_inv;
sf = sf_coeff[0]*sin(MY_2PI*s1);
sf += sf_coeff[1]*sin(MY_4PI*s1);
sf *= 2.0*qi;
f[i].x += qfactor*(ekx - sf);
s2 = x[i].y*hy_inv;
sf = sf_coeff[2]*sin(MY_2PI*s2);
sf += sf_coeff[3]*sin(MY_4PI*s2);
sf *= 2.0*qi;
f[i].y += qfactor*(eky - sf);
s3 = x[i].z*hz_inv;
sf = sf_coeff[4]*sin(MY_2PI*s3);
sf += sf_coeff[5]*sin(MY_4PI*s3);
sf *= 2.0*qi;
if (slabflag != 2) f[i].z += qfactor*(ekz - sf);
}
} // end of parallel region
}
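For ad differentiation the self-force term removes the grid-induced force an atom exerts on itself: per direction, sf = 2*q_i*(sf_coeff[0]*sin(2*pi*s) + sf_coeff[1]*sin(4*pi*s)), with s the fractional grid coordinate. A numeric sketch of one component; the sf_coeff, charge, field, and coordinate values here are placeholders, not real PPPM output:

#include <math.h>
#include <stdio.h>

int main() {
  const double MY_2PI = 6.28318530717958648;
  const double MY_4PI = 2.0*MY_2PI;
  const double sf_coeff0 = 1.0e-4, sf_coeff1 = -2.0e-5;  // hypothetical coefficients
  const double qi = -0.8476;                             // an SPC/E-like charge
  const double qfactor = 332.06371*qi;                   // qqrd2e (real units) * scale * q
  const double s = 0.37;                                 // x[i].x*hx_inv, made up
  double sf = sf_coeff0*sin(MY_2PI*s) + sf_coeff1*sin(MY_4PI*s);
  sf *= 2.0*qi;
  const double ekx = 0.0123;                             // interpolated field, made up
  printf("fx contribution = %g\n", qfactor*(ekx - sf));
  return 0;
}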
/* ----------------------------------------------------------------------
interpolate from grid to get per-atom energy/virial
------------------------------------------------------------------------- */
void PPPMOMP::fieldforce_peratom() void PPPMOMP::fieldforce_peratom()
{ {
@@ -508,8 +505,8 @@ void PPPMOMP::fieldforce_peratom()
// (dx,dy,dz) = distance to "lower left" grid pt // (dx,dy,dz) = distance to "lower left" grid pt
// (mx,my,mz) = global coords of moving stencil pt // (mx,my,mz) = global coords of moving stencil pt
const double * const q = atom->q; const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
const double * const * const x = atom->x; const double * _noalias const q = atom->q;
const int nthreads = comm->nthreads; const int nthreads = comm->nthreads;
const int nlocal = atom->nlocal; const int nlocal = atom->nlocal;
@@ -517,73 +514,61 @@ void PPPMOMP::fieldforce_peratom()
#pragma omp parallel default(none) #pragma omp parallel default(none)
#endif #endif
{ {
#if defined(_OPENMP)
// each thread works on a fixed chunk of atoms.
const int tid = omp_get_thread_num();
const int inum = nlocal;
const int idelta = 1 + inum/nthreads;
const int ifrom = tid*idelta;
const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
#else
const int ifrom = 0;
const int ito = nlocal;
const int tid = 0;
#endif
ThrData *thr = fix->get_thr(tid);
FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
int i,l,m,n,nx,ny,nz,mx,my,mz;
FFT_SCALAR dx,dy,dz,x0,y0,z0; FFT_SCALAR dx,dy,dz,x0,y0,z0;
FFT_SCALAR u,v0,v1,v2,v3,v4,v5; FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz;
// this if protects against having more threads than local atoms loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
if (ifrom < nlocal) {
for (int i = ifrom; i < ito; i++) {
nx = part2grid[i][0]; // get per thread data
ny = part2grid[i][1]; ThrData *thr = fix->get_thr(tid);
nz = part2grid[i][2]; FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
compute_rho1d_thr(r1d,dx,dy,dz); for (i = ifrom; i < ito; ++i) {
nx = part2grid[i][0];
ny = part2grid[i][1];
nz = part2grid[i][2];
dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv;
dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv;
dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv;
u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; compute_rho1d_thr(r1d,dx,dy,dz);
for (n = nlower; n <= nupper; n++) {
mz = n+nz; u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
z0 = r1d[2][n]; for (n = nlower; n <= nupper; n++) {
for (m = nlower; m <= nupper; m++) { mz = n+nz;
my = m+ny; z0 = r1d[2][n];
y0 = z0*r1d[1][m]; for (m = nlower; m <= nupper; m++) {
for (l = nlower; l <= nupper; l++) { my = m+ny;
mx = l+nx; y0 = z0*r1d[1][m];
x0 = y0*r1d[0][l]; for (l = nlower; l <= nupper; l++) {
if (eflag_atom) u += x0*u_brick[mz][my][mx]; mx = l+nx;
if (vflag_atom) { x0 = y0*r1d[0][l];
v0 += x0*v0_brick[mz][my][mx]; if (eflag_atom) u += x0*u_brick[mz][my][mx];
v1 += x0*v1_brick[mz][my][mx]; if (vflag_atom) {
v2 += x0*v2_brick[mz][my][mx]; v0 += x0*v0_brick[mz][my][mx];
v3 += x0*v3_brick[mz][my][mx]; v1 += x0*v1_brick[mz][my][mx];
v4 += x0*v4_brick[mz][my][mx]; v2 += x0*v2_brick[mz][my][mx];
v5 += x0*v5_brick[mz][my][mx]; v3 += x0*v3_brick[mz][my][mx];
} v4 += x0*v4_brick[mz][my][mx];
v5 += x0*v5_brick[mz][my][mx];
} }
} }
} }
}
if (eflag_atom) eatom[i] += q[i]*u; const double qi = q[i];
if (vflag_atom) { if (eflag_atom) eatom[i] += qi*u;
vatom[i][0] += v0; if (vflag_atom) {
vatom[i][1] += v1; vatom[i][0] += qi*v0;
vatom[i][2] += v2; vatom[i][1] += qi*v1;
vatom[i][3] += v3; vatom[i][2] += qi*v2;
vatom[i][4] += v4; vatom[i][3] += qi*v3;
vatom[i][5] += v5; vatom[i][4] += qi*v4;
} vatom[i][5] += qi*v5;
} }
} }
} } // end of parallel region
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@@ -609,3 +594,28 @@ void PPPMOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR
r1d[2][k] = r3; r1d[2][k] = r3;
} }
} }
/* ----------------------------------------------------------------------
charge assignment into drho1d
dx,dy,dz = distance of particle from "lower left" grid point
------------------------------------------------------------------------- */
void PPPMOMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx,
const FFT_SCALAR &dy, const FFT_SCALAR &dz)
{
int k,l;
FFT_SCALAR r1,r2,r3;
for (k = (1-order)/2; k <= order/2; k++) {
r1 = r2 = r3 = ZEROF;
for (l = order-2; l >= 0; l--) {
r1 = drho_coeff[l][k] + r1*dx;
r2 = drho_coeff[l][k] + r2*dy;
r3 = drho_coeff[l][k] + r3*dz;
}
d1d[0][k] = r1;
d1d[1][k] = r2;
d1d[2][k] = r3;
}
}
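Both compute_rho1d_thr() and compute_drho1d_thr() evaluate the charge-assignment polynomials by Horner's rule; the derivative table has one coefficient fewer, hence the inner loop starting at order-2 instead of order-1. A minimal Horner evaluation for comparison (coefficients made up):

#include <stdio.h>

int main() {
  const int order = 4;
  const double c[4] = {1.0, -0.5, 0.25, -0.125};  // c[0] + c[1]*x + c[2]*x^2 + c[3]*x^3
  const double x = 0.3;
  double r = 0.0;
  for (int l = order-1; l >= 0; l--)  // same loop shape as the rho1d code above
    r = c[l] + r*x;
  printf("p(%g) = %g\n", x, r);       // 0.869125
  return 0;
}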
---- file: pppm_omp.h ----
@@ -25,23 +25,29 @@ KSpaceStyle(pppm/omp,PPPMOMP)
namespace LAMMPS_NS { namespace LAMMPS_NS {
class PPPMOMP : public PPPM, public ThrOMP { class PPPMOMP : public PPPM, public ThrOMP {
public: public:
PPPMOMP(class LAMMPS *, int, char **); PPPMOMP(class LAMMPS *, int, char **);
virtual ~PPPMOMP () {}; virtual ~PPPMOMP () {};
virtual void setup();
virtual void compute(int, int); virtual void compute(int, int);
protected: protected:
virtual void allocate(); virtual void allocate();
virtual void deallocate(); virtual void deallocate();
virtual void fieldforce();
virtual void fieldforce_peratom();
virtual void make_rho();
virtual void compute_gf_ik();
virtual void compute_gf_ad();
virtual void fieldforce_ik();
virtual void fieldforce_ad();
virtual void fieldforce_peratom();
// virtual void make_rho();
private:
void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &, void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
const FFT_SCALAR &, const FFT_SCALAR &); const FFT_SCALAR &, const FFT_SCALAR &);
// void compute_rho_coeff(); void compute_drho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
const FFT_SCALAR &, const FFT_SCALAR &);
// void slabcorr(int); // void slabcorr(int);
}; };
---- file: pppm_tip4p_omp.cpp ----
@@ -19,9 +19,12 @@
#include "atom.h" #include "atom.h"
#include "comm.h" #include "comm.h"
#include "domain.h" #include "domain.h"
#include "error.h"
#include "fix_omp.h"
#include "force.h" #include "force.h"
#include "memory.h" #include "memory.h"
#include "math_const.h" #include "math_const.h"
#include "math_special.h"
#include <string.h> #include <string.h>
#include <math.h> #include <math.h>
@@ -29,74 +32,294 @@
#include "suffix.h" #include "suffix.h"
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
using namespace MathConst; using namespace MathConst;
using namespace MathSpecial;
#define OFFSET 16384
#ifdef FFT_SINGLE #ifdef FFT_SINGLE
#define ZEROF 0.0f #define ZEROF 0.0f
#define ONEF 1.0f
#else #else
#define ZEROF 0.0 #define ZEROF 0.0
#define ONEF 1.0
#endif #endif
#define EPS_HOC 1.0e-7
#define OFFSET 16384
/* ---------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- */
PPPMTIP4POMP::PPPMTIP4POMP(LAMMPS *lmp, int narg, char **arg) : PPPMTIP4POMP::PPPMTIP4POMP(LAMMPS *lmp, int narg, char **arg) :
PPPMOMP(lmp, narg, arg) PPPMTIP4P(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE)
{ {
tip4pflag = 1;
suffix_flag |= Suffix::OMP; suffix_flag |= Suffix::OMP;
} }
/* ---------------------------------------------------------------------- */ /* ----------------------------------------------------------------------
allocate memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */
// NOTE: special version of reduce_data for FFT_SCALAR data type. void PPPMTIP4POMP::allocate()
// reduce per thread data into the first part of the data
// array that is used for the non-threaded parts and reset
// the temporary storage to 0.0. this routine depends on
// multi-dimensional arrays like force stored in this order
// x1,y1,z1,x2,y2,z2,...
// we need to post a barrier to wait until all threads are done
// writing to the array.
static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid)
{ {
#if defined(_OPENMP) PPPMTIP4P::allocate();
// NOOP in non-threaded execution.
if (nthreads == 1) return;
#pragma omp barrier
{
const int nvals = ndim*nall;
const int idelta = nvals/nthreads + 1;
const int ifrom = tid*idelta;
const int ito = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);
// this if protects against having more threads than atoms #if defined(_OPENMP)
if (ifrom < nall) { #pragma omp parallel default(none)
for (int m = ifrom; m < ito; ++m) {
for (int n = 1; n < nthreads; ++n) {
dall[m] += dall[n*nvals + m];
dall[n*nvals + m] = 0.0;
}
}
}
}
#else
// NOOP in non-threaded execution.
return;
#endif #endif
{
#if defined(_OPENMP)
const int tid = omp_get_thread_num();
#else
const int tid = 0;
#endif
ThrData *thr = fix->get_thr(tid);
thr->init_pppm(order,memory);
}
} }
/* ---------------------------------------------------------------------- */ /* ----------------------------------------------------------------------
free memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */
void PPPMTIP4POMP::init() void PPPMTIP4POMP::deallocate()
{ {
// TIP4P PPPM requires newton on, b/c it computes forces on ghost atoms PPPMTIP4P::deallocate();
if (force->newton == 0) #if defined(_OPENMP)
error->all(FLERR,"Kspace style pppm/tip4p/omp requires newton on"); #pragma omp parallel default(none)
#endif
{
#if defined(_OPENMP)
const int tid = omp_get_thread_num();
#else
const int tid = 0;
#endif
ThrData *thr = fix->get_thr(tid);
thr->init_pppm(-order,memory);
}
}
PPPMOMP::init(); /* ----------------------------------------------------------------------
pre-compute modified (Hockney-Eastwood) Coulomb Green's function
------------------------------------------------------------------------- */
void PPPMTIP4POMP::compute_gf_ik()
{
const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
const double xprd = prd[0];
const double yprd = prd[1];
const double zprd = prd[2];
const double zprd_slab = zprd*slab_volfactor;
const double unitkx = (MY_2PI/xprd);
const double unitky = (MY_2PI/yprd);
const double unitkz = (MY_2PI/zprd_slab);
const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
pow(-log(EPS_HOC),0.25));
const int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
pow(-log(EPS_HOC),0.25));
const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
pow(-log(EPS_HOC),0.25));
const int numk = nxhi_fft - nxlo_fft + 1;
const int numl = nyhi_fft - nylo_fft + 1;
const int twoorder = 2*order;
#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
{
double snx,sny,snz;
double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
double sum1,dot1,dot2;
double numerator,denominator;
double sqk;
int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid;
loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
for (n = nfrom; n < nto; ++n) {
m = n / (numl*numk);
l = (n - m*numl*numk) / numk;
k = n - m*numl*numk - l*numk;
m += nzlo_fft;
l += nylo_fft;
k += nxlo_fft;
mper = m - nz_pppm*(2*m/nz_pppm);
snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
lper = l - ny_pppm*(2*l/ny_pppm);
sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
kper = k - nx_pppm*(2*k/nx_pppm);
snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
if (sqk != 0.0) {
numerator = 12.5663706/sqk;
denominator = gf_denom(snx,sny,snz);
sum1 = 0.0;
for (nx = -nbx; nx <= nbx; nx++) {
qx = unitkx*(kper+nx_pppm*nx);
sx = exp(-0.25*square(qx/g_ewald));
argx = 0.5*qx*xprd/nx_pppm;
wx = powsinxx(argx,twoorder);
for (ny = -nby; ny <= nby; ny++) {
qy = unitky*(lper+ny_pppm*ny);
sy = exp(-0.25*square(qy/g_ewald));
argy = 0.5*qy*yprd/ny_pppm;
wy = powsinxx(argy,twoorder);
for (nz = -nbz; nz <= nbz; nz++) {
qz = unitkz*(mper+nz_pppm*nz);
sz = exp(-0.25*square(qz/g_ewald));
argz = 0.5*qz*zprd_slab/nz_pppm;
wz = powsinxx(argz,twoorder);
dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
dot2 = qx*qx+qy*qy+qz*qz;
sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
}
}
}
greensfn[n] = numerator*sum1/denominator;
} else greensfn[n] = 0.0;
}
} // end of parallel region
}
/* ----------------------------------------------------------------------
compute optimized Green's function for energy calculation
------------------------------------------------------------------------- */
void PPPMTIP4POMP::compute_gf_ad()
{
const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
const double xprd = prd[0];
const double yprd = prd[1];
const double zprd = prd[2];
const double zprd_slab = zprd*slab_volfactor;
const double unitkx = (MY_2PI/xprd);
const double unitky = (MY_2PI/yprd);
const double unitkz = (MY_2PI/zprd_slab);
const int numk = nxhi_fft - nxlo_fft + 1;
const int numl = nyhi_fft - nylo_fft + 1;
const int twoorder = 2*order;
double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;
#if defined(_OPENMP)
#pragma omp parallel default(none) reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
#endif
{
double snx,sny,snz,sqk;
double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
double numerator,denominator;
int k,l,m,kper,lper,mper,n,nfrom,nto,tid;
loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
for (n = nfrom; n < nto; ++n) {
m = n / (numl*numk);
l = (n - m*numl*numk) / numk;
k = n - m*numl*numk - l*numk;
m += nzlo_fft;
l += nylo_fft;
k += nxlo_fft;
mper = m - nz_pppm*(2*m/nz_pppm);
qz = unitkz*mper;
snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
sz = exp(-0.25*square(qz/g_ewald));
argz = 0.5*qz*zprd_slab/nz_pppm;
wz = powsinxx(argz,twoorder);
lper = l - ny_pppm*(2*l/ny_pppm);
qy = unitky*lper;
sny = square(sin(0.5*qy*yprd/ny_pppm));
sy = exp(-0.25*square(qy/g_ewald));
argy = 0.5*qy*yprd/ny_pppm;
wy = powsinxx(argy,twoorder);
kper = k - nx_pppm*(2*k/nx_pppm);
qx = unitkx*kper;
snx = square(sin(0.5*qx*xprd/nx_pppm));
sx = exp(-0.25*square(qx/g_ewald));
argx = 0.5*qx*xprd/nx_pppm;
wx = powsinxx(argx,twoorder);
sqk = qx*qx + qy*qy + qz*qz;
if (sqk != 0.0) {
numerator = MY_4PI/sqk;
denominator = gf_denom(snx,sny,snz);
greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
sf0 += sf_precoeff1[n]*greensfn[n];
sf1 += sf_precoeff2[n]*greensfn[n];
sf2 += sf_precoeff3[n]*greensfn[n];
sf3 += sf_precoeff4[n]*greensfn[n];
sf4 += sf_precoeff5[n]*greensfn[n];
sf5 += sf_precoeff6[n]*greensfn[n];
} else {
greensfn[n] = 0.0;
sf0 += sf_precoeff1[n]*greensfn[n];
sf1 += sf_precoeff2[n]*greensfn[n];
sf2 += sf_precoeff3[n]*greensfn[n];
sf3 += sf_precoeff4[n]*greensfn[n];
sf4 += sf_precoeff5[n]*greensfn[n];
sf5 += sf_precoeff6[n]*greensfn[n];
}
}
} // end of parallel region
// compute the coefficients for the self-force correction
double prex, prey, prez, tmp[6];
prex = prey = prez = MY_PI/volume;
prex *= nx_pppm/xprd;
prey *= ny_pppm/yprd;
prez *= nz_pppm/zprd_slab;
tmp[0] = sf0 * prex;
tmp[1] = sf1 * prex*2;
tmp[2] = sf2 * prey;
tmp[3] = sf3 * prey*2;
tmp[4] = sf4 * prez;
tmp[5] = sf5 * prez*2;
// communicate values with other procs
MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
}
/* ----------------------------------------------------------------------
run the regular toplevel compute method from plain PPPM
which will have individual methods replaced by our threaded
versions and then call the obligatory force reduction.
------------------------------------------------------------------------- */
void PPPMTIP4POMP::compute(int eflag, int vflag)
{
PPPMTIP4P::compute(eflag,vflag);
#if defined(_OPENMP)
#pragma omp parallel default(none) shared(eflag,vflag)
#endif
{
#if defined(_OPENMP)
const int tid = omp_get_thread_num();
#else
const int tid = 0;
#endif
ThrData *thr = fix->get_thr(tid);
reduce_thr(this, eflag, vflag, thr);
} // end of omp parallel region
} }
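PPPMTIP4POMP::compute() follows the same wrapper pattern as the pppm/omp variant: run the base-class compute, which scatters forces into per-thread buffers, then fold those buffers back into the global array. A schematic of the pattern with the ThrOMP machinery elided; the class names here are illustrative:

#include <stdio.h>
#include <vector>

struct Base {
  virtual ~Base() {}
  virtual void compute() { /* would fill per-thread force buffers */ }
};

struct OMPWrapper : Base {
  std::vector<std::vector<double> > f_thr;  // [nthreads][3*nlocal] private forces
  std::vector<double> f;                    // global forces, 3*nlocal
  void compute() {
    Base::compute();
    for (int t = 0; t < (int) f_thr.size(); ++t)     // reduce_thr() analogue
      for (int i = 0; i < (int) f.size(); ++i) {
        f[i] += f_thr[t][i];
        f_thr[t][i] = 0.0;                           // reset for the next step
      }
  }
};

int main() {
  OMPWrapper w;
  w.f.assign(6, 0.0);
  w.f_thr.assign(2, std::vector<double>(6, 1.0));
  w.compute();
  printf("f[0]=%g\n", w.f[0]);  // 2.0: both thread buffers folded in
  return 0;
}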
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@@ -107,8 +330,12 @@ void PPPMTIP4POMP::init()
void PPPMTIP4POMP::particle_map() void PPPMTIP4POMP::particle_map()
{ {
const int * const type = atom->type; const int * _noalias const type = atom->type;
const double * const * const x = atom->x; const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
int3_t * _noalias const p2g = (int3_t *) part2grid[0];
const double boxlox = boxlo[0];
const double boxloy = boxlo[1];
const double boxloz = boxlo[2];
const int nlocal = atom->nlocal; const int nlocal = atom->nlocal;
int i, flag = 0; int i, flag = 0;
@@ -116,34 +343,33 @@ void PPPMTIP4POMP::particle_map()
#pragma omp parallel for private(i) default(none) reduction(+:flag) schedule(static) #pragma omp parallel for private(i) default(none) reduction(+:flag) schedule(static)
#endif #endif
for (i = 0; i < nlocal; i++) { for (i = 0; i < nlocal; i++) {
int nx,ny,nz,iH1,iH2; dbl3_t xM;
double xM[3]; int iH1,iH2;
if (type[i] == typeO) { if (type[i] == typeO) {
find_M(i,iH1,iH2,xM); find_M_thr(i,iH1,iH2,xM);
} else { } else {
xM[0] = x[i][0]; xM = x[i];
xM[1] = x[i][1];
xM[2] = x[i][2];
} }
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
// current particle coord can be outside global and local box // current particle coord can be outside global and local box
// add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1 // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1
nx = static_cast<int> ((xM[0]-boxlo[0])*delxinv+shift) - OFFSET; const int nx = static_cast<int> ((xM.x-boxlox)*delxinv+shift) - OFFSET;
ny = static_cast<int> ((xM[1]-boxlo[1])*delyinv+shift) - OFFSET; const int ny = static_cast<int> ((xM.y-boxloy)*delyinv+shift) - OFFSET;
nz = static_cast<int> ((xM[2]-boxlo[2])*delzinv+shift) - OFFSET; const int nz = static_cast<int> ((xM.z-boxloz)*delzinv+shift) - OFFSET;
part2grid[i][0] = nx; p2g[i].a = nx;
part2grid[i][1] = ny; p2g[i].b = ny;
part2grid[i][2] = nz; p2g[i].t = nz;
// check that entire stencil around nx,ny,nz will fit in my 3d brick // check that entire stencil around nx,ny,nz will fit in my 3d brick
if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
ny+nlower < nylo_out || ny+nupper > nyhi_out || ny+nlower < nylo_out || ny+nupper > nyhi_out ||
nz+nlower < nzlo_out || nz+nupper > nzhi_out) flag++; nz+nlower < nzlo_out || nz+nupper > nzhi_out)
flag++;
} }
int flag_all; int flag_all;
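The OFFSET add/subtract in particle_map() turns int truncation (toward zero) into a floor for the coordinate range PPPM encounters, as the comment above notes for int(-0.75). A two-line demonstration, with OFFSET copied from the file:

#include <stdio.h>

int main() {
  const int OFFSET = 16384;
  const double v = -0.75;                                  // scaled grid coordinate
  printf("truncated: %d\n", (int) v);                      // 0  (wrong bin)
  printf("floored:   %d\n", (int)(v + OFFSET) - OFFSET);   // -1 (desired)
  return 0;
}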
@@ -160,96 +386,68 @@ void PPPMTIP4POMP::particle_map()
void PPPMTIP4POMP::make_rho() void PPPMTIP4POMP::make_rho()
{ {
const double * const q = atom->q; const double * _noalias const q = atom->q;
const double * const * const x = atom->x; const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
const int * const type = atom->type; const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
const int nthreads = comm->nthreads; const int * _noalias const type = atom->type;
dbl3_t xM;
FFT_SCALAR x0,y0,z0;
const double boxlox = boxlo[0];
const double boxloy = boxlo[1];
const double boxloz = boxlo[2];
int l,m,n,mx,my,mz,iH1,iH2;
const int nlocal = atom->nlocal; const int nlocal = atom->nlocal;
#if defined(_OPENMP) // clear 3d density array
#pragma omp parallel default(none)
#endif
{
#if defined(_OPENMP)
// each thread works on a fixed chunk of atoms.
const int tid = omp_get_thread_num();
const int inum = nlocal;
const int idelta = 1 + inum/nthreads;
const int ifrom = tid*idelta;
const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
#else
const int tid = 0;
const int ifrom = 0;
const int ito = nlocal;
#endif
int l,m,n,nx,ny,nz,mx,my,mz,iH1,iH2; FFT_SCALAR *vec = &density_brick[nzlo_out][nylo_out][nxlo_out];
FFT_SCALAR dx,dy,dz,x0,y0,z0; memset(vec,0,ngrid*sizeof(FFT_SCALAR));
double xM[3];
// set up clear 3d density array // loop over my charges, add their contribution to nearby grid points
const int nzoffs = (nzhi_out-nzlo_out+1)*tid; // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]); // (dx,dy,dz) = distance to "lower left" grid pt
memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR)); // (mx,my,mz) = global coords of moving stencil pt
ThrData *thr = fix->get_thr(tid); for (int i = 0; i < nlocal; i++) {
FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d()); if (type[i] == typeO) {
find_M_thr(i,iH1,iH2,xM);
} else {
xM = x[i];
}
// loop over my charges, add their contribution to nearby grid points const int nx = p2g[i].a;
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge const int ny = p2g[i].b;
// (dx,dy,dz) = distance to "lower left" grid pt const int nz = p2g[i].t;
// (mx,my,mz) = global coords of moving stencil pt const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;
// this if protects against having more threads than local atoms compute_rho1d(dx,dy,dz);
if (ifrom < nlocal) {
for (int i = ifrom; i < ito; i++) {
if (type[i] == typeO) { z0 = delvolinv * q[i];
find_M(i,iH1,iH2,xM); for (n = nlower; n <= nupper; n++) {
} else { mz = n+nz;
xM[0] = x[i][0]; y0 = z0*rho1d[2][n];
xM[1] = x[i][1]; for (m = nlower; m <= nupper; m++) {
xM[2] = x[i][2]; my = m+ny;
} x0 = y0*rho1d[1][m];
for (l = nlower; l <= nupper; l++) {
nx = part2grid[i][0]; mx = l+nx;
ny = part2grid[i][1]; density_brick[mz][my][mx] += x0*rho1d[0][l];
nz = part2grid[i][2];
dx = nx+shiftone - (xM[0]-boxlo[0])*delxinv;
dy = ny+shiftone - (xM[1]-boxlo[1])*delyinv;
dz = nz+shiftone - (xM[2]-boxlo[2])*delzinv;
compute_rho1d_thr(r1d,dx,dy,dz);
z0 = delvolinv * q[i];
for (n = nlower; n <= nupper; n++) {
mz = n+nz;
y0 = z0*r1d[2][n];
for (m = nlower; m <= nupper; m++) {
my = m+ny;
x0 = y0*r1d[1][m];
for (l = nlower; l <= nupper; l++) {
mx = l+nx;
db[mz][my][mx] += x0*r1d[0][l];
}
}
} }
} }
} }
#if defined(_OPENMP)
// reduce 3d density array
if (nthreads > 1) {
data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
}
#endif
} }
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
interpolate from grid to get electric field & force on my particles interpolate from grid to get electric field & force on my particles for ik
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
void PPPMTIP4POMP::fieldforce() void PPPMTIP4POMP::fieldforce_ik()
{ {
// loop over my charges, interpolate electric field from nearby grid points // loop over my charges, interpolate electric field from nearby grid points
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
@@ -257,106 +455,216 @@ void PPPMTIP4POMP::fieldforce()
// (mx,my,mz) = global coords of moving stencil pt // (mx,my,mz) = global coords of moving stencil pt
// ek = 3 components of E-field on particle // ek = 3 components of E-field on particle
const double * const q = atom->q; const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
const double * const * const x = atom->x; const double * _noalias const q = atom->q;
const int * const type = atom->type; const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
const int * _noalias const type = atom->type;
const double qqrd2e = force->qqrd2e;
const double boxlox = boxlo[0];
const double boxloy = boxlo[1];
const double boxloz = boxlo[2];
const int nthreads = comm->nthreads; const int nthreads = comm->nthreads;
const int nlocal = atom->nlocal; const int nlocal = atom->nlocal;
const double qqrd2e = force->qqrd2e;
#if defined(_OPENMP) #if defined(_OPENMP)
#pragma omp parallel default(none) #pragma omp parallel default(none)
#endif #endif
{ {
#if defined(_OPENMP) dbl3_t xM;
// each thread works on a fixed chunk of atoms. FFT_SCALAR x0,y0,z0,ekx,eky,ekz;
const int tid = omp_get_thread_num(); int i,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz;
const int inum = nlocal;
const int idelta = 1 + inum/nthreads; loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
const int ifrom = tid*idelta;
const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta; // get per thread data
#else
const int ifrom = 0;
const int ito = nlocal;
const int tid = 0;
#endif
ThrData *thr = fix->get_thr(tid); ThrData *thr = fix->get_thr(tid);
double * const * const f = thr->get_f(); dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d()); FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
int l,m,n,nx,ny,nz,mx,my,mz; for (i = ifrom; i < ito; ++i) {
FFT_SCALAR dx,dy,dz,x0,y0,z0; if (type[i] == typeO) {
FFT_SCALAR ekx,eky,ekz; find_M_thr(i,iH1,iH2,xM);
int iH1,iH2; } else xM = x[i];
double xM[3], fx,fy,fz;
// this if protects against having more threads than local atoms const int nx = p2g[i].a;
if (ifrom < nlocal) { const int ny = p2g[i].b;
for (int i = ifrom; i < ito; i++) { const int nz = p2g[i].t;
const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;
if (type[i] == typeO) { compute_rho1d_thr(r1d,dx,dy,dz);
find_M(i,iH1,iH2,xM);
} else {
xM[0] = x[i][0];
xM[1] = x[i][1];
xM[2] = x[i][2];
}
nx = part2grid[i][0]; ekx = eky = ekz = ZEROF;
ny = part2grid[i][1]; for (n = nlower; n <= nupper; n++) {
nz = part2grid[i][2]; mz = n+nz;
dx = nx+shiftone - (xM[0]-boxlo[0])*delxinv; z0 = r1d[2][n];
dy = ny+shiftone - (xM[1]-boxlo[1])*delyinv; for (m = nlower; m <= nupper; m++) {
dz = nz+shiftone - (xM[2]-boxlo[2])*delzinv; my = m+ny;
y0 = z0*r1d[1][m];
compute_rho1d_thr(r1d,dx,dy,dz); for (l = nlower; l <= nupper; l++) {
mx = l+nx;
ekx = eky = ekz = ZEROF; x0 = y0*r1d[0][l];
for (n = nlower; n <= nupper; n++) { ekx -= x0*vdx_brick[mz][my][mx];
mz = n+nz; eky -= x0*vdy_brick[mz][my][mx];
z0 = r1d[2][n]; ekz -= x0*vdz_brick[mz][my][mx];
for (m = nlower; m <= nupper; m++) {
my = m+ny;
y0 = z0*r1d[1][m];
for (l = nlower; l <= nupper; l++) {
mx = l+nx;
x0 = y0*r1d[0][l];
ekx -= x0*vdx_brick[mz][my][mx];
eky -= x0*vdy_brick[mz][my][mx];
ekz -= x0*vdz_brick[mz][my][mx];
}
} }
} }
}
// convert E-field to force // convert E-field to force
const double qfactor = qqrd2e*scale*q[i];
if (type[i] != typeO) { const double qfactor = qqrd2e * scale * q[i];
f[i][0] += qfactor*ekx; if (type[i] != typeO) {
f[i][1] += qfactor*eky; f[i].x += qfactor*ekx;
f[i][2] += qfactor*ekz; f[i].y += qfactor*eky;
if (slabflag != 2) f[i].z += qfactor*ekz;
} else { } else {
fx = qfactor * ekx; const double fx = qfactor * ekx;
fy = qfactor * eky; const double fy = qfactor * eky;
fz = qfactor * ekz; const double fz = qfactor * ekz;
find_M(i,iH1,iH2,xM);
f[i][0] += fx*(1 - alpha); f[i].x += fx*(1 - alpha);
f[i][1] += fy*(1 - alpha); f[i].y += fy*(1 - alpha);
f[i][2] += fz*(1 - alpha); if (slabflag != 2) f[i].z += fz*(1 - alpha);
f[iH1][0] += 0.5*alpha*fx; f[iH1].x += 0.5*alpha*fx;
f[iH1][1] += 0.5*alpha*fy; f[iH1].y += 0.5*alpha*fy;
f[iH1][2] += 0.5*alpha*fz; if (slabflag != 2) f[iH1].z += 0.5*alpha*fz;
f[iH2][0] += 0.5*alpha*fx; f[iH2].x += 0.5*alpha*fx;
f[iH2][1] += 0.5*alpha*fy; f[iH2].y += 0.5*alpha*fy;
f[iH2][2] += 0.5*alpha*fz; if (slabflag != 2) f[iH2].z += 0.5*alpha*fz;
}
} }
} }
} } // end of parallel region
}
/* ----------------------------------------------------------------------
interpolate from grid to get electric field & force on my particles for ad
------------------------------------------------------------------------- */
void PPPMTIP4POMP::fieldforce_ad()
{
const double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda;
const double hx_inv = nx_pppm/prd[0];
const double hy_inv = ny_pppm/prd[1];
const double hz_inv = nz_pppm/prd[2];
// loop over my charges, interpolate electric field from nearby grid points
// (nx,ny,nz) = global coords of grid pt to "lower left" of charge
// (dx,dy,dz) = distance to "lower left" grid pt
// (mx,my,mz) = global coords of moving stencil pt
// ek = 3 components of E-field on particle
const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
const double * _noalias const q = atom->q;
const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
const int * _noalias const type = atom->type;
const double qqrd2e = force->qqrd2e;
const double boxlox = boxlo[0];
const double boxloy = boxlo[1];
const double boxloz = boxlo[2];
const int nthreads = comm->nthreads;
const int nlocal = atom->nlocal;
#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
{
double s1,s2,s3,sf;
dbl3_t xM;
FFT_SCALAR ekx,eky,ekz;
int i,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz;
loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
// get per thread data
ThrData *thr = fix->get_thr(tid);
dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());
for (i = ifrom; i < ito; ++i) {
if (type[i] == typeO) {
find_M_thr(i,iH1,iH2,xM);
} else xM = x[i];
const int nx = p2g[i].a;
const int ny = p2g[i].b;
const int nz = p2g[i].t;
const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;
compute_rho1d_thr(r1d,dx,dy,dz);
compute_drho1d_thr(d1d,dx,dy,dz);
ekx = eky = ekz = ZEROF;
for (n = nlower; n <= nupper; n++) {
mz = n+nz;
for (m = nlower; m <= nupper; m++) {
my = m+ny;
for (l = nlower; l <= nupper; l++) {
mx = l+nx;
ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
}
}
}
ekx *= hx_inv;
eky *= hy_inv;
ekz *= hz_inv;
// convert E-field to force and subtract self forces
const double qi = q[i];
const double qfactor = qqrd2e * scale * qi;
s1 = x[i].x*hx_inv;
sf = sf_coeff[0]*sin(MY_2PI*s1);
sf += sf_coeff[1]*sin(MY_4PI*s1);
sf *= 2.0*qi;
const double fx = qfactor*(ekx - sf);
s2 = x[i].y*hy_inv;
sf = sf_coeff[2]*sin(MY_2PI*s2);
sf += sf_coeff[3]*sin(MY_4PI*s2);
sf *= 2.0*qi;
const double fy = qfactor*(eky - sf);
s3 = x[i].z*hz_inv;
sf = sf_coeff[4]*sin(MY_2PI*s3);
sf += sf_coeff[5]*sin(MY_4PI*s3);
sf *= 2.0*qi;
const double fz = qfactor*(ekz - sf);
if (type[i] != typeO) {
f[i].x += fx;
f[i].y += fy;
if (slabflag != 2) f[i].z += fz;
} else {
f[i].x += fx*(1 - alpha);
f[i].y += fy*(1 - alpha);
if (slabflag != 2) f[i].z += fz*(1 - alpha);
f[iH1].x += 0.5*alpha*fx;
f[iH1].y += 0.5*alpha*fy;
if (slabflag != 2) f[iH1].z += 0.5*alpha*fz;
f[iH2].x += 0.5*alpha*fx;
f[iH2].y += 0.5*alpha*fy;
if (slabflag != 2) f[iH2].z += 0.5*alpha*fz;
}
}
} // end of parallel region
} }
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
@@ -365,7 +673,7 @@ void PPPMTIP4POMP::fieldforce()
also return local indices iH1,iH2 of H atoms also return local indices iH1,iH2 of H atoms
------------------------------------------------------------------------- */ ------------------------------------------------------------------------- */
void PPPMTIP4POMP::find_M(int i, int &iH1, int &iH2, double *xM) void PPPMTIP4POMP::find_M_thr(int i, int &iH1, int &iH2, dbl3_t &xM)
{ {
iH1 = atom->map(atom->tag[i] + 1); iH1 = atom->map(atom->tag[i] + 1);
iH2 = atom->map(atom->tag[i] + 2); iH2 = atom->map(atom->tag[i] + 2);
@@ -374,19 +682,69 @@ void PPPMTIP4POMP::find_M(int i, int &iH1, int &iH2, double *xM)
if (atom->type[iH1] != typeH || atom->type[iH2] != typeH) if (atom->type[iH1] != typeH || atom->type[iH2] != typeH)
error->one(FLERR,"TIP4P hydrogen has incorrect atom type"); error->one(FLERR,"TIP4P hydrogen has incorrect atom type");
const double * const * const x = atom->x; const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
double delx1 = x[iH1][0] - x[i][0]; double delx1 = x[iH1].x - x[i].x;
double dely1 = x[iH1][1] - x[i][1]; double dely1 = x[iH1].y - x[i].y;
double delz1 = x[iH1][2] - x[i][2]; double delz1 = x[iH1].z - x[i].z;
domain->minimum_image(delx1,dely1,delz1); domain->minimum_image(delx1,dely1,delz1);
double delx2 = x[iH2][0] - x[i][0]; double delx2 = x[iH2].x - x[i].x;
double dely2 = x[iH2][1] - x[i][1]; double dely2 = x[iH2].y - x[i].y;
double delz2 = x[iH2][2] - x[i][2]; double delz2 = x[iH2].z - x[i].z;
domain->minimum_image(delx2,dely2,delz2); domain->minimum_image(delx2,dely2,delz2);
xM[0] = x[i][0] + alpha * 0.5 * (delx1 + delx2); xM.x = x[i].x + alpha * 0.5 * (delx1 + delx2);
xM[1] = x[i][1] + alpha * 0.5 * (dely1 + dely2); xM.y = x[i].y + alpha * 0.5 * (dely1 + dely2);
xM[2] = x[i][2] + alpha * 0.5 * (delz1 + delz2); xM.z = x[i].z + alpha * 0.5 * (delz1 + delz2);
}
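find_M_thr() places the massless TIP4P M site a fraction alpha along the bisector of the two minimum-imaged O-H vectors. A standalone sketch with made-up water geometry (0.9572 O-H distance, 104.52 degree angle); the alpha value is illustrative, in LAMMPS it is derived from qdist:

#include <stdio.h>

typedef struct { double x,y,z; } dbl3_t;

int main() {
  const double alpha = 0.15;                  // hypothetical value
  const dbl3_t xO = {0.0, 0.0, 0.0};
  const dbl3_t d1 = {0.9572, 0.0, 0.0};       // O->H1, minimum image already applied
  const dbl3_t d2 = {-0.2400, 0.9266, 0.0};   // O->H2
  dbl3_t xM;
  xM.x = xO.x + alpha * 0.5 * (d1.x + d2.x);  // same update as find_M_thr()
  xM.y = xO.y + alpha * 0.5 * (d1.y + d2.y);
  xM.z = xO.z + alpha * 0.5 * (d1.z + d2.z);
  printf("M site: %.4f %.4f %.4f\n", xM.x, xM.y, xM.z);
  return 0;
}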
/* ----------------------------------------------------------------------
charge assignment into rho1d
dx,dy,dz = distance of particle from "lower left" grid point
------------------------------------------------------------------------- */
void PPPMTIP4POMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx,
const FFT_SCALAR &dy, const FFT_SCALAR &dz)
{
int k,l;
FFT_SCALAR r1,r2,r3;
for (k = (1-order)/2; k <= order/2; k++) {
r1 = r2 = r3 = ZEROF;
for (l = order-1; l >= 0; l--) {
r1 = rho_coeff[l][k] + r1*dx;
r2 = rho_coeff[l][k] + r2*dy;
r3 = rho_coeff[l][k] + r3*dz;
}
r1d[0][k] = r1;
r1d[1][k] = r2;
r1d[2][k] = r3;
}
}
/* ----------------------------------------------------------------------
charge assignment into drho1d
dx,dy,dz = distance of particle from "lower left" grid point
------------------------------------------------------------------------- */
void PPPMTIP4POMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx,
const FFT_SCALAR &dy, const FFT_SCALAR &dz)
{
int k,l;
FFT_SCALAR r1,r2,r3;
for (k = (1-order)/2; k <= order/2; k++) {
r1 = r2 = r3 = ZEROF;
for (l = order-2; l >= 0; l--) {
r1 = drho_coeff[l][k] + r1*dx;
r2 = drho_coeff[l][k] + r2*dy;
r3 = drho_coeff[l][k] + r3*dz;
}
d1d[0][k] = r1;
d1d[1][k] = r2;
d1d[2][k] = r3;
}
} }
---- file: pppm_tip4p_omp.h ----
@@ -20,25 +20,40 @@ KSpaceStyle(pppm/tip4p/omp,PPPMTIP4POMP)
#ifndef LMP_PPPM_TIP4P_OMP_H #ifndef LMP_PPPM_TIP4P_OMP_H
#define LMP_PPPM_TIP4P_OMP_H #define LMP_PPPM_TIP4P_OMP_H
#include "pppm_omp.h" #include "pppm_tip4p.h"
#include "thr_omp.h"
namespace LAMMPS_NS { namespace LAMMPS_NS {
class PPPMTIP4POMP : public PPPMOMP { class PPPMTIP4POMP : public PPPMTIP4P, public ThrOMP {
public: public:
PPPMTIP4POMP(class LAMMPS *, int, char **); PPPMTIP4POMP(class LAMMPS *, int, char **);
virtual ~PPPMTIP4POMP () {}; virtual ~PPPMTIP4POMP () {};
virtual void init(); virtual void compute(int, int);
protected: protected:
virtual void allocate();
virtual void deallocate();
virtual void compute_gf_ik();
virtual void compute_gf_ad();
virtual void particle_map(); virtual void particle_map();
virtual void fieldforce(); virtual void make_rho(); // XXX: not (yet) multi-threaded
virtual void make_rho();
virtual void fieldforce_ik();
virtual void fieldforce_ad();
// virtual void fieldforce_peratom(); XXX: need to benchmark first.
private: private:
void find_M(int, int &, int &, double *); void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
const FFT_SCALAR &, const FFT_SCALAR &);
void compute_drho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
const FFT_SCALAR &, const FFT_SCALAR &);
// void slabcorr(int); void find_M_thr(const int, int &, int &, dbl3_t &);
// void slabcorr(int); // XXX: not (yet) multi-threaded
}; };
---- file: thr_data.cpp ----
@@ -21,13 +21,14 @@
#include <string.h> #include <string.h>
#include <stdio.h> #include <stdio.h>
#include "memory.h"
using namespace LAMMPS_NS; using namespace LAMMPS_NS;
ThrData::ThrData(int tid) ThrData::ThrData(int tid)
: _f(NULL), _torque(NULL), _erforce(NULL), _de(NULL), _drho(NULL), _mu(NULL), : _f(0),_torque(0),_erforce(0),_de(0),_drho(0),_mu(0),_lambda(0),_rhoB(0),
_lambda(NULL), _rhoB(NULL), _D_values(NULL), _rho(NULL), _fp(NULL), _D_values(0),_rho(0),_fp(0),_rho1d(0),_drho1d(0),_tid(tid)
_rho1d(NULL), _tid(tid)
{ {
// nothing else to do here. // nothing else to do here.
} }
@@ -136,6 +137,33 @@ void ThrData::init_eim(int nall, double *rho, double *fp)
memset(_fp,0,nall*sizeof(double)); memset(_fp,0,nall*sizeof(double));
} }
/* ----------------------------------------------------------------------
if order > 0 : set up per thread storage for PPPM
if order < 0 : free per thread storage for PPPM
------------------------------------------------------------------------- */
#if defined(FFT_SINGLE)
typedef float FFT_SCALAR;
#else
typedef double FFT_SCALAR;
#endif
void ThrData::init_pppm(int order, Memory *memory)
{
FFT_SCALAR **rho1d, **drho1d;
if (order > 0) {
memory->create2d_offset(rho1d,3,-order/2,order/2,"thr_data:rho1d");
memory->create2d_offset(drho1d,3,-order/2,order/2,"thr_data:drho1d");
_rho1d = static_cast<void *>(rho1d);
_drho1d = static_cast<void *>(drho1d);
} else {
order = -order;
rho1d = static_cast<FFT_SCALAR **>(_rho1d);
drho1d = static_cast<FFT_SCALAR **>(_drho1d);
memory->destroy2d_offset(rho1d,-order/2);
memory->destroy2d_offset(drho1d,-order/2);
}
}
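ThrData keeps rho1d/drho1d behind void* members so the header need not know FFT_SCALAR; call sites cast back with static_cast, as the thr->get_rho1d() uses above do. A minimal sketch of that store/retrieve round trip:

#include <stdio.h>
#include <stdlib.h>

typedef double FFT_SCALAR;  // the FFT_SINGLE=off branch

int main() {
  // store a typed array behind a void*, as ThrData does for _rho1d
  FFT_SCALAR *rho1d = (FFT_SCALAR *) calloc(8, sizeof(FFT_SCALAR));
  void *opaque = static_cast<void *>(rho1d);
  // ... later, at the point of use (cf. the get_rho1d() call sites)
  FFT_SCALAR *r1d = static_cast<FFT_SCALAR *>(opaque);
  r1d[0] = 1.5;
  printf("%g\n", r1d[0]);
  free(rho1d);
  return 0;
}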
/* ---------------------------------------------------------------------- /* ----------------------------------------------------------------------
compute global pair virial via summing F dot r over own & ghost atoms compute global pair virial via summing F dot r over own & ghost atoms
at this point, only pairwise forces have been accumulated in atom->f at this point, only pairwise forces have been accumulated in atom->f
---- file: thr_data.h ----
@@ -53,7 +53,7 @@ class ThrData {
void init_eam(int, double *); // EAM void init_eam(int, double *); // EAM
void init_eim(int, double *, double *); // EIM (+ EAM) void init_eim(int, double *, double *); // EIM (+ EAM)
void init_pppm(void *r1d) { _rho1d = r1d; }; void init_pppm(int, class Memory *);
// access methods for arrays that we handle in this class // access methods for arrays that we handle in this class
double **get_lambda() const { return _lambda; }; double **get_lambda() const { return _lambda; };
@@ -63,6 +63,7 @@ class ThrData {
double *get_rho() const { return _rho; }; double *get_rho() const { return _rho; };
double *get_rhoB() const { return _rhoB; }; double *get_rhoB() const { return _rhoB; };
void *get_rho1d() const { return _rho1d; }; void *get_rho1d() const { return _rho1d; };
void *get_drho1d() const { return _drho1d; };
private: private:
double eng_vdwl; // non-bonded non-coulomb energy double eng_vdwl; // non-bonded non-coulomb energy
@@ -108,6 +109,7 @@
// this is for pppm/omp // this is for pppm/omp
void *_rho1d; void *_rho1d;
void *_drho1d;
// my thread id // my thread id
const int _tid; const int _tid;
---- file: thr_omp.h ----
@@ -186,4 +186,19 @@ static inline void loop_setup_thr(int &ifrom, int &ito, int &tid,
} }
} }
// helpful definitions to help compilers optimizing code better
typedef struct { double x,y,z; } dbl3_t;
typedef struct { double x,y,z,w; } dbl4_t;
typedef struct { int a,b,t; } int3_t;
typedef struct { int a,b,c,t; } int4_t;
typedef struct { int a,b,c,d,t; } int5_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif
#endif #endif
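The dbl3_t/int3_t views let code address the row-major x[i][0..2] layout as x[i].x/.y/.z through a single restrict-qualified pointer, which gives the compiler more room to vectorize. An illustration of the cast; the atom array here is a mock, not LAMMPS data:

#include <stdio.h>

typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif

int main() {
  // mock of atom->x: nlocal x 3 doubles, contiguous like LAMMPS 2d arrays
  double xraw[2][3] = {{1.0,2.0,3.0},{4.0,5.0,6.0}};
  const dbl3_t * _noalias const x = (dbl3_t *) &xraw[0][0];
  printf("atom 1: %g %g %g\n", x[1].x, x[1].y, x[1].z);  // 4 5 6
  return 0;
}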