diff --git a/src/KSPACE/pppm_cg.cpp b/src/KSPACE/pppm_cg.cpp index 499b55c08f..93d93c48c3 100644 --- a/src/KSPACE/pppm_cg.cpp +++ b/src/KSPACE/pppm_cg.cpp @@ -19,8 +19,10 @@ #include "mpi.h" #include "math.h" #include "stdlib.h" +#include "string.h" #include "atom.h" +#include "commgrid.h" #include "domain.h" #include "error.h" #include "force.h" @@ -34,7 +36,11 @@ using namespace MathConst; #define OFFSET 16384 #define SMALLQ 0.00001 -#if defined(FFT_SINGLE) + +enum{REVERSE_RHO}; +enum{FORWARD_IK,FORWARD_AD,FORWARD_IK_PERATOM,FORWARD_AD_PERATOM}; + +#ifdef FFT_SINGLE #define ZEROF 0.0f #else #define ZEROF 0.0 @@ -42,7 +48,7 @@ using namespace MathConst; /* ---------------------------------------------------------------------- */ -PPPMCG::PPPMCG(LAMMPS *lmp, int narg, char **arg) : PPPMOld(lmp, narg, arg) +PPPMCG::PPPMCG(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg) { if ((narg < 1) || (narg > 2)) error->all(FLERR,"Illegal kspace_style pppm/cg command"); @@ -54,6 +60,7 @@ PPPMCG::PPPMCG(LAMMPS *lmp, int narg, char **arg) : PPPMOld(lmp, narg, arg) num_charged = -1; is_charged = NULL; + group_group_enable = 1; } /* ---------------------------------------------------------------------- @@ -71,7 +78,6 @@ PPPMCG::~PPPMCG() void PPPMCG::compute(int eflag, int vflag) { - int i,j; // set energy/virial flags // invoke allocate_peratom() if needed for first time @@ -82,6 +88,8 @@ void PPPMCG::compute(int eflag, int vflag) if (evflag_atom && !peratom_allocate_flag) { allocate_peratom(); + cg_peratom->ghost_notify(); + cg_peratom->setup(); peratom_allocate_flag = 1; } @@ -110,7 +118,7 @@ void PPPMCG::compute(int eflag, int vflag) double charged_frac, charged_fmax, charged_fmin; num_charged=0; - for (i=0; i < atom->nlocal; ++i) + for (int i=0; i < atom->nlocal; ++i) if (fabs(atom->q[i]) > smallq) ++num_charged; @@ -148,7 +156,7 @@ void PPPMCG::compute(int eflag, int vflag) } num_charged = 0; - for (i = 0; i < atom->nlocal; ++i) + for (int i = 0; i < atom->nlocal; ++i) if (fabs(atom->q[i]) > smallq) { is_charged[num_charged] = i; ++num_charged; @@ -164,6 +172,7 @@ void PPPMCG::compute(int eflag, int vflag) // to fully sum contribution in their 3d bricks // remap from 3d decomposition to FFT decomposition + cg->reverse_comm(this,REVERSE_RHO); brick2fft(); // compute potential gradient on my FFT grid and @@ -176,11 +185,17 @@ void PPPMCG::compute(int eflag, int vflag) // all procs communicate E-field values // to fill ghost cells surrounding their 3d bricks - fillbrick(); + if (differentiation_flag == 1) cg->forward_comm(this,FORWARD_AD); + else cg->forward_comm(this,FORWARD_IK); // extra per-atom energy/virial communication - if (evflag_atom) fillbrick_peratom(); + if (evflag_atom) { + if (differentiation_flag == 1 && vflag_atom) + cg_peratom->forward_comm(this,FORWARD_AD_PERATOM); + else if (differentiation_flag == 0) + cg_peratom->forward_comm(this,FORWARD_IK_PERATOM); + } // calculate the force on my particles @@ -210,19 +225,18 @@ void PPPMCG::compute(int eflag, int vflag) if (vflag_global) { double virial_all[6]; MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world); - for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; + for (int i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i]; } // per-atom energy/virial // energy includes self-energy correction if (evflag_atom) { - double *q = atom->q; - int nlocal = atom->nlocal; + const double * const q = atom->q; if (eflag_atom) { for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; + const int i = 
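/* ----------------------------------------------------------------------
   a standalone sketch of the charged-list compression that compute()
   above builds: atoms with |q| > smallq are gathered into is_charged
   once per step, so every later interpolation loop only touches them.
   plain C++, not the LAMMPS API; build_charged_list is a hypothetical
   name used for illustration
------------------------------------------------------------------------- */

#include <cmath>
#include <vector>

// collect the indices of atoms whose charge magnitude exceeds the cutoff
std::vector<int> build_charged_list(const double *q, int nlocal, double smallq)
{
  std::vector<int> charged;
  charged.reserve(nlocal);
  for (int i = 0; i < nlocal; ++i)
    if (std::fabs(q[i]) > smallq) charged.push_back(i);
  return charged;
}

// consumers then iterate the compressed list only:
//   for (int j = 0; j < (int) charged.size(); ++j) { const int i = charged[j]; ... }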
is_charged[j]; eatom[i] *= 0.5; eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / (g_ewald*g_ewald*volume); @@ -231,16 +245,16 @@ void PPPMCG::compute(int eflag, int vflag) } if (vflag_atom) { - for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; - for (int n = 0; n < 6; n++) vatom[i][n] *= 0.5*q[i]*qscale; + for (int n = 0; n < num_charged; n++) { + const int i = is_charged[n]; + for (int j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale; } } } // 2d slab correction - if (slabflag) slabcorr(); + if (slabflag == 1) slabcorr(); // convert atoms back from lamda to box coords @@ -279,7 +293,8 @@ void PPPMCG::particle_map() if (nx+nlower < nxlo_out || nx+nupper > nxhi_out || ny+nlower < nylo_out || ny+nupper > nyhi_out || - nz+nlower < nzlo_out || nz+nupper > nzhi_out) flag = 1; + nz+nlower < nzlo_out || nz+nupper > nzhi_out) + flag = 1; } if (flag) error->one(FLERR,"Out of range atoms - cannot compute PPPM"); @@ -299,8 +314,8 @@ void PPPMCG::make_rho() // clear 3d density array - FFT_SCALAR *vec = &density_brick[nzlo_out][nylo_out][nxlo_out]; - for (i = 0; i < ngrid; i++) vec[i] = ZEROF; + memset(&(density_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); // loop over my charges, add their contribution to nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -311,7 +326,7 @@ void PPPMCG::make_rho() double **x = atom->x; for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; + i = is_charged[j]; nx = part2grid[i][0]; ny = part2grid[i][1]; @@ -337,12 +352,11 @@ void PPPMCG::make_rho() } } } - /* ---------------------------------------------------------------------- - interpolate from grid to get electric field & force on my particles + interpolate from grid to get electric field & force on my particles for ik ------------------------------------------------------------------------- */ -void PPPMCG::fieldforce() +void PPPMCG::fieldforce_ik() { int i,l,m,n,nx,ny,nz,mx,my,mz; FFT_SCALAR dx,dy,dz,x0,y0,z0; @@ -392,74 +406,162 @@ void PPPMCG::fieldforce() const double qfactor = force->qqrd2e * scale * q[i]; f[i][0] += qfactor*ekx; f[i][1] += qfactor*eky; - f[i][2] += qfactor*ekz; + if (slabflag != 2) f[i][2] += qfactor*ekz; } } /* ---------------------------------------------------------------------- - interpolate from grid to get per-atom energy/virial - ------------------------------------------------------------------------- */ + interpolate from grid to get electric field & force on my particles for ad +------------------------------------------------------------------------- */ + +void PPPMCG::fieldforce_ad() +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz; + FFT_SCALAR ekx,eky,ekz; + double s1,s2,s3; + double sf = 0.0; + double *prd; + + if (triclinic == 0) prd = domain->prd; + else prd = domain->prd_lamda; + + double xprd = prd[0]; + double yprd = prd[1]; + double zprd = prd[2]; + + double hx_inv = nx_pppm/xprd; + double hy_inv = ny_pppm/yprd; + double hz_inv = nz_pppm/zprd; + + // loop over my charges, interpolate electric field from nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + // ek = 3 components of E-field on particle + + double *q = atom->q; + double **x = atom->x; + double **f = atom->f; + + for (int j = 0; j < num_charged; j++) { + i = is_charged[j]; + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - 
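/* ----------------------------------------------------------------------
   in the make_rho() hunk above, the explicit clearing loop becomes a
   single memset over ngrid values; that is only valid because the 3d
   density brick is backed by one contiguous allocation with pointer
   tiers layered on top. a self-contained sketch of such a layout
   (hypothetical helper, not LAMMPS' Memory class; error checks and
   deallocation omitted for brevity)
------------------------------------------------------------------------- */

#include <cstdlib>
#include <cstring>

// allocate an nz x ny x nx array whose data lives in one contiguous block
double ***create3d(int nz, int ny, int nx)
{
  double *data = (double *) calloc((size_t) nz*ny*nx, sizeof(double));
  double **rows = (double **) malloc((size_t) nz*ny*sizeof(double *));
  double ***planes = (double ***) malloc((size_t) nz*sizeof(double **));
  for (int k = 0; k < nz; ++k) {
    planes[k] = &rows[(size_t) k*ny];
    for (int j = 0; j < ny; ++j)
      rows[(size_t) k*ny + j] = &data[((size_t) k*ny + j)*nx];
  }
  return planes;
}

// because the data block is contiguous, clearing it is a single call:
//   memset(&brick[0][0][0], 0, (size_t) nz*ny*nx*sizeof(double));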
(x[i][0]-boxlo[0])*delxinv;
+ dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+ dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+ compute_rho1d(dx,dy,dz);
+ compute_drho1d(dx,dy,dz);
+
+ ekx = eky = ekz = ZEROF;
+ for (n = nlower; n <= nupper; n++) {
+ mz = n+nz;
+ for (m = nlower; m <= nupper; m++) {
+ my = m+ny;
+ for (l = nlower; l <= nupper; l++) {
+ mx = l+nx;
+ ekx += drho1d[0][l]*rho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx];
+ eky += rho1d[0][l]*drho1d[1][m]*rho1d[2][n]*u_brick[mz][my][mx];
+ ekz += rho1d[0][l]*rho1d[1][m]*drho1d[2][n]*u_brick[mz][my][mx];
+ }
+ }
+ }
+ ekx *= hx_inv;
+ eky *= hy_inv;
+ ekz *= hz_inv;
+
+ // convert E-field to force and subtract self forces
+
+ const double qfactor = force->qqrd2e * scale;
+
+ s1 = x[i][0]*hx_inv;
+ s2 = x[i][1]*hy_inv;
+ s3 = x[i][2]*hz_inv;
+ sf = sf_coeff[0]*sin(2*MY_PI*s1);
+ sf += sf_coeff[1]*sin(4*MY_PI*s1);
+ sf *= 2*q[i]*q[i];
+ f[i][0] += qfactor*(ekx*q[i] - sf);
+
+ sf = sf_coeff[2]*sin(2*MY_PI*s2);
+ sf += sf_coeff[3]*sin(4*MY_PI*s2);
+ sf *= 2*q[i]*q[i];
+ f[i][1] += qfactor*(eky*q[i] - sf);
+
+ sf = sf_coeff[4]*sin(2*MY_PI*s3);
+ sf += sf_coeff[5]*sin(4*MY_PI*s3);
+ sf *= 2*q[i]*q[i];
+ if (slabflag != 2) f[i][2] += qfactor*(ekz*q[i] - sf);
+ }
+}
+
+/* ----------------------------------------------------------------------
+ interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */

 void PPPMCG::fieldforce_peratom()
 {
- int i,l,m,n,nx,ny,nz,mx,my,mz;
- FFT_SCALAR dx,dy,dz,x0,y0,z0;
- FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+ int i,l,m,n,nx,ny,nz,mx,my,mz;
+ FFT_SCALAR dx,dy,dz,x0,y0,z0;
+ FFT_SCALAR u,v0,v1,v2,v3,v4,v5;

- // loop over my charges, interpolate from nearby grid points
- // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
- // (dx,dy,dz) = distance to "lower left" grid pt
- // (mx,my,mz) = global coords of moving stencil pt
+ // loop over my charges, interpolate from nearby grid points
+ // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+ // (dx,dy,dz) = distance to "lower left" grid pt
+ // (mx,my,mz) = global coords of moving stencil pt

- double *q = atom->q;
- double **x = atom->x;
- double **f = atom->f;
+ double *q = atom->q;
+ double **x = atom->x;

- for (int j = 0; j < num_charged; j++) {
- i = is_charged[j];
+ for (int j = 0; j < num_charged; j++) {
+ i = is_charged[j];

- nx = part2grid[i][0];
- ny = part2grid[i][1];
- nz = part2grid[i][2];
- dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
- dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
- dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+ nx = part2grid[i][0];
+ ny = part2grid[i][1];
+ nz = part2grid[i][2];
+ dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+ dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+ dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

- compute_rho1d(dx,dy,dz);
+ compute_rho1d(dx,dy,dz);

- u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
- for (n = nlower; n <= nupper; n++) {
- mz = n+nz;
- z0 = rho1d[2][n];
- for (m = nlower; m <= nupper; m++) {
- my = m+ny;
- y0 = z0*rho1d[1][m];
- for (l = nlower; l <= nupper; l++) {
- mx = l+nx;
- x0 = y0*rho1d[0][l];
- if (eflag_atom) u += x0*u_brick[mz][my][mx];
- if (vflag_atom) {
- v0 += x0*v0_brick[mz][my][mx];
- v1 += x0*v1_brick[mz][my][mx];
- v2 += x0*v2_brick[mz][my][mx];
- v3 += x0*v3_brick[mz][my][mx];
- v4 += x0*v4_brick[mz][my][mx];
- v5 += x0*v5_brick[mz][my][mx];
- }
- }
- }
- }
-
- if (eflag_atom) eatom[i] += q[i]*u;
- if (vflag_atom) {
- vatom[i][0] += v0;
- vatom[i][1] += v1;
- vatom[i][2] += 
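/* ----------------------------------------------------------------------
   in fieldforce_ad() above the force is q*E with E taken from the
   gradient of the interpolated potential, so the discretization leaves
   a spurious self force that the sf_coeff terms remove. a sketch of
   the per-axis correction, assuming s = x/h is the position in grid
   units and c2,c4 stand for one pair of precomputed coefficients
------------------------------------------------------------------------- */

#include <cmath>

// spurious self-force contribution along one axis; it is subtracted from
// q*E before the qqrd2e scaling, mirroring the expression used above
static double self_force_term(double c2, double c4, double s, double q)
{
  const double twopi = 6.28318530717958647692;
  const double sf = c2*std::sin(twopi*s) + c4*std::sin(2.0*twopi*s);
  return 2.0*q*q*sf;
}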
v2; - vatom[i][3] += v3; - vatom[i][4] += v4; - vatom[i][5] += v5; + u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*rho1d[0][l]; + if (eflag_atom) u += x0*u_brick[mz][my][mx]; + if (vflag_atom) { + v0 += x0*v0_brick[mz][my][mx]; + v1 += x0*v1_brick[mz][my][mx]; + v2 += x0*v2_brick[mz][my][mx]; + v3 += x0*v3_brick[mz][my][mx]; + v4 += x0*v4_brick[mz][my][mx]; + v5 += x0*v5_brick[mz][my][mx]; + } } + } } + + if (eflag_atom) eatom[i] += q[i]*u; + if (vflag_atom) { + vatom[i][0] += q[i]*v0; + vatom[i][1] += q[i]*v1; + vatom[i][2] += q[i]*v2; + vatom[i][3] += q[i]*v3; + vatom[i][4] += q[i]*v4; + vatom[i][5] += q[i]*v5; + } + } } /* ---------------------------------------------------------------------- @@ -471,14 +573,17 @@ void PPPMCG::fieldforce_peratom() void PPPMCG::slabcorr() { + int i,j; + // compute local contribution to global dipole moment - double *q = atom->q; - double **x = atom->x; - + const double * const q = atom->q; + const double * const * const x = atom->x; double dipole = 0.0; - for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; + + + for (j = 0; j < num_charged; j++) { + i = is_charged[j]; dipole += q[i]*x[i][2]; } @@ -489,29 +594,101 @@ void PPPMCG::slabcorr() // compute corrections - const double e_slabcorr = 2.0*MY_PI*dipole_all*dipole_all/volume; + const double e_slabcorr = MY_2PI*dipole_all*dipole_all/volume; const double qscale = force->qqrd2e * scale; if (eflag_global) energy += qscale * e_slabcorr; - //per-atom energy + // per-atom energy if (eflag_atom) { - double efact = 2.0*MY_PI*dipole_all/volume; - for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; + const double efact = MY_2PI*dipole_all/volume; + for (j = 0; j < num_charged; j++) { + i = is_charged[j]; eatom[i] += qscale * q[i]*x[i][2]*efact; } } // add on force corrections - const double ffact = -4.0*MY_PI*dipole_all/volume * qscale; - double **f = atom->f; + const double ffact = -MY_4PI*dipole_all/volume; + double * const * const f = atom->f; + + for (j = 0; j < num_charged; j++) { + i = is_charged[j]; + f[i][2] += qscale * q[i]*ffact; + } +} + +/* ---------------------------------------------------------------------- + create discretized "density" on section of global grid due to my particles + density(x,y,z) = charge "density" at grid points of my 3d brick + (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) + in global grid for group-group interactions + ------------------------------------------------------------------------- */ + +void PPPMCG::make_rho_groups(int groupbit_A, int groupbit_B, int BA_flag) +{ + int i,l,m,n,nx,ny,nz,mx,my,mz; + FFT_SCALAR dx,dy,dz,x0,y0,z0; + + // clear 3d density arrays + + memset(&(density_A_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + memset(&(density_B_brick[nzlo_out][nylo_out][nxlo_out]),0, + ngrid*sizeof(FFT_SCALAR)); + + // loop over my charges, add their contribution to nearby grid points + // (nx,ny,nz) = global coords of grid pt to "lower left" of charge + // (dx,dy,dz) = distance to "lower left" grid pt + // (mx,my,mz) = global coords of moving stencil pt + + const double * const q = atom->q; + const double * const * const x = atom->x; + const int * const mask = atom->mask; for (int j = 0; j < num_charged; j++) { - int i = is_charged[j]; - f[i][2] += q[i]*ffact; + i = is_charged[j]; + + 
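/* ----------------------------------------------------------------------
   slabcorr() above applies the Yeh-Berkowitz correction for slab
   geometries: the net dipole M_z adds an energy 2*pi*M_z^2/V and a
   force -4*pi*q_i*M_z/V on every atom. a minimal MPI sketch of the
   same arithmetic (standalone signature, not the LAMMPS interface;
   assumes MPI has been initialized)
------------------------------------------------------------------------- */

#include <mpi.h>

// qqrd2e converts charge^2/length into energy units, as in the code above
void slab_correction(const double *q, const double *z, double *fz, int nlocal,
                     double volume, double qqrd2e, double *energy, MPI_Comm world)
{
  const double pi = 3.14159265358979323846;
  double dipole = 0.0, dipole_all = 0.0;
  for (int i = 0; i < nlocal; ++i) dipole += q[i]*z[i];   // local contribution
  MPI_Allreduce(&dipole, &dipole_all, 1, MPI_DOUBLE, MPI_SUM, world);

  *energy += qqrd2e * 2.0*pi*dipole_all*dipole_all/volume;
  const double ffact = -4.0*pi*dipole_all/volume;
  for (int i = 0; i < nlocal; ++i) fz[i] += qqrd2e * q[i]*ffact;
}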
if ((mask[i] & groupbit_A) && (mask[i] & groupbit_B)) + if (BA_flag) continue; + + if ((mask[i] & groupbit_A) || (mask[i] & groupbit_B)) { + + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + + compute_rho1d(dx,dy,dz); + + z0 = delvolinv * q[i]; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + y0 = z0*rho1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + x0 = y0*rho1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + + // group A + + if (mask[i] & groupbit_A) + density_A_brick[mz][my][mx] += x0*rho1d[0][l]; + + // group B + + if (mask[i] & groupbit_B) + density_B_brick[mz][my][mx] += x0*rho1d[0][l]; + } + } + } + } } } @@ -521,7 +698,7 @@ void PPPMCG::slabcorr() double PPPMCG::memory_usage() { - double bytes = PPPMOld::memory_usage(); + double bytes = PPPM::memory_usage(); bytes += nmax * sizeof(int); return bytes; } diff --git a/src/KSPACE/pppm_cg.h b/src/KSPACE/pppm_cg.h index 6215d13249..3fee5fb1e0 100644 --- a/src/KSPACE/pppm_cg.h +++ b/src/KSPACE/pppm_cg.h @@ -20,11 +20,11 @@ KSpaceStyle(pppm/cg,PPPMCG) #ifndef LMP_PPPM_CG_H #define LMP_PPPM_CG_H -#include "pppm_old.h" +#include "pppm.h" namespace LAMMPS_NS { -class PPPMCG : public PPPMOld { +class PPPMCG : public PPPM { public: PPPMCG(class LAMMPS *, int, char **); virtual ~PPPMCG(); @@ -38,9 +38,11 @@ class PPPMCG : public PPPMOld { virtual void particle_map(); virtual void make_rho(); - virtual void fieldforce(); + virtual void fieldforce_ik(); + virtual void fieldforce_ad(); virtual void fieldforce_peratom(); virtual void slabcorr(); + virtual void make_rho_groups(int, int, int); }; } diff --git a/src/KSPACE/pppm_tip4p.cpp b/src/KSPACE/pppm_tip4p.cpp index 4abd241006..dd2f3da0cc 100644 --- a/src/KSPACE/pppm_tip4p.cpp +++ b/src/KSPACE/pppm_tip4p.cpp @@ -226,7 +226,7 @@ void PPPMTIP4P::fieldforce_ik() if (type[i] != typeO) { f[i][0] += qfactor*ekx; f[i][1] += qfactor*eky; - f[i][2] += qfactor*ekz; + if (slabflag != 2) f[i][2] += qfactor*ekz; } else { fx = qfactor * ekx; @@ -236,15 +236,15 @@ void PPPMTIP4P::fieldforce_ik() f[i][0] += fx*(1 - alpha); f[i][1] += fy*(1 - alpha); - f[i][2] += fz*(1 - alpha); + if (slabflag != 2) f[i][2] += fz*(1 - alpha); f[iH1][0] += 0.5*alpha*fx; f[iH1][1] += 0.5*alpha*fy; - f[iH1][2] += 0.5*alpha*fz; + if (slabflag != 2) f[iH1][2] += 0.5*alpha*fz; f[iH2][0] += 0.5*alpha*fx; f[iH2][1] += 0.5*alpha*fy; - f[iH2][2] += 0.5*alpha*fz; + if (slabflag != 2) f[iH2][2] += 0.5*alpha*fz; } } } @@ -256,7 +256,7 @@ void PPPMTIP4P::fieldforce_ik() void PPPMTIP4P::fieldforce_ad() { int i,l,m,n,nx,ny,nz,mx,my,mz; - FFT_SCALAR dx,dy,dz,x0,y0,z0,dx0,dy0,dz0; + FFT_SCALAR dx,dy,dz; FFT_SCALAR ekx,eky,ekz; double *xi; int iH1,iH2; @@ -272,13 +272,10 @@ void PPPMTIP4P::fieldforce_ad() double xprd = prd[0]; double yprd = prd[1]; double zprd = prd[2]; - double zprd_slab = zprd*slab_volfactor; double hx_inv = nx_pppm/xprd; double hy_inv = ny_pppm/yprd; double hz_inv = nz_pppm/zprd; - - // loop over my charges, interpolate electric field from nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -302,9 +299,9 @@ void PPPMTIP4P::fieldforce_ad() nx = part2grid[i][0]; ny = part2grid[i][1]; nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + dx = nx+shiftone - 
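/* ----------------------------------------------------------------------
   make_rho_groups() above relies on LAMMPS-style group bitmasks: every
   group owns one bit of the per-atom mask, so membership in A, in B,
   or in both is a single AND each. sketch of the tests
------------------------------------------------------------------------- */

// groupbit_A and groupbit_B each have exactly one bit set
static inline bool in_group(int mask, int groupbit) { return (mask & groupbit) != 0; }

// the BA_flag test above skips atoms belonging to both groups so the
// A<->B contribution is not counted twice:
//   if (in_group(mask[i],gbA) && in_group(mask[i],gbB) && BA_flag) continue;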
(xi[0]-boxlo[0])*delxinv; + dy = ny+shiftone - (xi[1]-boxlo[1])*delyinv; + dz = nz+shiftone - (xi[2]-boxlo[2])*delzinv; compute_rho1d(dx,dy,dz); compute_drho1d(dx,dy,dz); @@ -335,38 +332,38 @@ void PPPMTIP4P::fieldforce_ad() s3 = x[i][2]*hz_inv; sf = sf_coeff[0]*sin(2*MY_PI*s1); sf += sf_coeff[1]*sin(4*MY_PI*s1); - sf *= 2*q[i]*q[i]; + sf *= 2.0*q[i]*q[i]; fx = qfactor*(ekx*q[i] - sf); sf = sf_coeff[2]*sin(2*MY_PI*s2); sf += sf_coeff[3]*sin(4*MY_PI*s2); - sf *= 2*q[i]*q[i]; + sf *= 2.0*q[i]*q[i]; fy = qfactor*(eky*q[i] - sf); sf = sf_coeff[4]*sin(2*MY_PI*s3); sf += sf_coeff[5]*sin(4*MY_PI*s3); - sf *= 2*q[i]*q[i]; + sf *= 2.0*q[i]*q[i]; fz = qfactor*(ekz*q[i] - sf); if (type[i] != typeO) { f[i][0] += fx; f[i][1] += fy; - f[i][2] += fz; + if (slabflag != 2) f[i][2] += fz; } else { find_M(i,iH1,iH2,xM); f[i][0] += fx*(1 - alpha); f[i][1] += fy*(1 - alpha); - f[i][2] += fz*(1 - alpha); + if (slabflag != 2) f[i][2] += fz*(1 - alpha); f[iH1][0] += 0.5*alpha*fx; f[iH1][1] += 0.5*alpha*fy; - f[iH1][2] += 0.5*alpha*fz; + if (slabflag != 2) f[iH1][2] += 0.5*alpha*fz; f[iH2][0] += 0.5*alpha*fx; f[iH2][1] += 0.5*alpha*fy; - f[iH2][2] += 0.5*alpha*fz; + if (slabflag != 2) f[iH2][2] += 0.5*alpha*fz; } } } @@ -392,7 +389,6 @@ void PPPMTIP4P::fieldforce_peratom() // ek = 3 components of E-field on particle double *q = atom->q; double **x = atom->x; - double **f = atom->f; int *type = atom->type; int nlocal = atom->nlocal; diff --git a/src/USER-OMP/pppm_cg_omp.cpp b/src/USER-OMP/pppm_cg_omp.cpp index 51fc048b3b..e65a9265e5 100644 --- a/src/USER-OMP/pppm_cg_omp.cpp +++ b/src/USER-OMP/pppm_cg_omp.cpp @@ -19,9 +19,12 @@ #include "atom.h" #include "comm.h" #include "domain.h" +#include "error.h" +#include "fix_omp.h" #include "force.h" #include "memory.h" #include "math_const.h" +#include "math_special.h" #include #include @@ -29,12 +32,14 @@ #include "suffix.h" using namespace LAMMPS_NS; using namespace MathConst; +using namespace MathSpecial; -#define SMALLQ 0.00001 -#if defined(FFT_SINGLE) +#ifdef FFT_SINGLE #define ZEROF 0.0f +#define ONEF 1.0f #else #define ZEROF 0.0 +#define ONEF 1.0 #endif #define EPS_HOC 1.0e-7 @@ -55,7 +60,27 @@ void PPPMCGOMP::allocate() { PPPMCG::allocate(); - const int nthreads = comm->nthreads; +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + thr->init_pppm(order,memory); + } +} + +/* ---------------------------------------------------------------------- + free memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPMCGOMP::deallocate() +{ + PPPMCG::deallocate(); #if defined(_OPENMP) #pragma omp parallel default(none) @@ -66,153 +91,26 @@ void PPPMCGOMP::allocate() #else const int tid = 0; #endif - - FFT_SCALAR **rho1d_thr; - memory->create2d_offset(rho1d_thr,3,-order/2,order/2,"pppm:rho1d_thr"); ThrData *thr = fix->get_thr(tid); - thr->init_pppm(static_cast(rho1d_thr)); - } - - const int nzend = (nzhi_out-nzlo_out+1)*nthreads + nzlo_out -1; - - // reallocate density brick, so it fits our needs - memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out); - memory->create3d_offset(density_brick,nzlo_out,nzend,nylo_out,nyhi_out, - nxlo_out,nxhi_out,"pppm:density_brick"); -} - -// NOTE: special version of reduce_data for FFT_SCALAR data type. 
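/* ----------------------------------------------------------------------
   the pppm_tip4p.cpp hunks above evaluate the k-space force at the
   massless M site and spread it onto the real O and H atoms with the
   geometric weight alpha. a sketch of that transfer for one molecule,
   using plain arrays rather than the find_M() bookkeeping
------------------------------------------------------------------------- */

// fM is the force computed at the M site; alpha lies in (0,1)
static void spread_m_site_force(const double fM[3], double fO[3],
                                double fH1[3], double fH2[3], double alpha)
{
  for (int d = 0; d < 3; ++d) {
    fO[d]  += (1.0 - alpha)*fM[d];   // oxygen keeps the larger share
    fH1[d] += 0.5*alpha*fM[d];       // the remainder is split evenly
    fH2[d] += 0.5*alpha*fM[d];       // between the two hydrogens
  }
}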
-// reduce per thread data into the first part of the data -// array that is used for the non-threaded parts and reset -// the temporary storage to 0.0. this routine depends on -// multi-dimensional arrays like force stored in this order -// x1,y1,z1,x2,y2,z2,... -// we need to post a barrier to wait until all threads are done -// writing to the array. -static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid) -{ -#if defined(_OPENMP) - // NOOP in non-threaded execution. - if (nthreads == 1) return; -#pragma omp barrier - { - const int nvals = ndim*nall; - const int idelta = nvals/nthreads + 1; - const int ifrom = tid*idelta; - const int ito = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta); - - // this if protects against having more threads than atoms - if (ifrom < nall) { - for (int m = ifrom; m < ito; ++m) { - for (int n = 1; n < nthreads; ++n) { - dall[m] += dall[n*nvals + m]; - dall[n*nvals + m] = 0.0; - } - } - } - } -#else - // NOOP in non-threaded execution. - return; -#endif -} - -/* ---------------------------------------------------------------------- - free memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPMCGOMP::deallocate() -{ - PPPMCG::deallocate(); - for (int i=0; i < comm->nthreads; ++i) { - ThrData * thr = fix->get_thr(i); - FFT_SCALAR ** rho1d_thr = static_cast(thr->get_rho1d()); - memory->destroy2d_offset(rho1d_thr,-order/2); + thr->init_pppm(-order,memory); } } /* ---------------------------------------------------------------------- - adjust PPPMCG coeffs, called initially and whenever volume has changed + pre-compute modified (Hockney-Eastwood) Coulomb Green's function ------------------------------------------------------------------------- */ -void PPPMCGOMP::setup() +void PPPMCGOMP::compute_gf_ik() { - int i,j,k,n; - double *prd; - - // volume-dependent factors - // adjust z dimension for 2d slab PPPM - // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 - - if (triclinic == 0) prd = domain->prd; - else prd = domain->prd_lamda; + const double * const prd = (triclinic==0) ? 
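/* ----------------------------------------------------------------------
   the data_reduce_fft() helper removed above folded the per-thread
   copies of the density brick back into copy 0, each thread reducing a
   disjoint slice after a barrier. the general pattern as a hedged
   standalone sketch; like the original it must be called by every
   thread of an enclosing parallel region
------------------------------------------------------------------------- */

// reduce nthreads copies of an n-element array into copy 0 and zero the
// spares; data holds copy0[0..n-1], copy1[0..n-1], ... back to back
static void reduce_thread_copies(double *data, int n, int nthreads, int tid)
{
#if defined(_OPENMP)
  if (nthreads == 1) return;
#pragma omp barrier              // wait until all copies are fully written
  const int idelta = n/nthreads + 1;
  const int ifrom = tid*idelta;
  const int ito = (ifrom + idelta > n) ? n : ifrom + idelta;
  for (int m = ifrom; m < ito; ++m)
    for (int t = 1; t < nthreads; ++t) {
      data[m] += data[t*n + m];
      data[t*n + m] = 0.0;
    }
#else
  (void) data; (void) n; (void) nthreads; (void) tid;   // single copy, nothing to do
#endif
}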
domain->prd : domain->prd_lamda; const double xprd = prd[0]; const double yprd = prd[1]; const double zprd = prd[2]; const double zprd_slab = zprd*slab_volfactor; - volume = xprd * yprd * zprd_slab; - - delxinv = nx_pppm/xprd; - delyinv = ny_pppm/yprd; - delzinv = nz_pppm/zprd_slab; - - delvolinv = delxinv*delyinv*delzinv; - - const double unitkx = (2.0*MY_PI/xprd); - const double unitky = (2.0*MY_PI/yprd); - const double unitkz = (2.0*MY_PI/zprd_slab); - - // fkx,fky,fkz for my FFT grid pts - - double per; - - for (i = nxlo_fft; i <= nxhi_fft; i++) { - per = i - nx_pppm*(2*i/nx_pppm); - fkx[i] = unitkx*per; - } - - for (i = nylo_fft; i <= nyhi_fft; i++) { - per = i - ny_pppm*(2*i/ny_pppm); - fky[i] = unitky*per; - } - - for (i = nzlo_fft; i <= nzhi_fft; i++) { - per = i - nz_pppm*(2*i/nz_pppm); - fkz[i] = unitkz*per; - } - - // virial coefficients - - double sqk,vterm; - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) { - for (j = nylo_fft; j <= nyhi_fft; j++) { - for (i = nxlo_fft; i <= nxhi_fft; i++) { - sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; - if (sqk == 0.0) { - vg[n][0] = 0.0; - vg[n][1] = 0.0; - vg[n][2] = 0.0; - vg[n][3] = 0.0; - vg[n][4] = 0.0; - vg[n][5] = 0.0; - } else { - vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); - vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; - vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; - vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; - vg[n][3] = vterm*fkx[i]*fky[j]; - vg[n][4] = vterm*fkx[i]*fkz[k]; - vg[n][5] = vterm*fky[j]*fkz[k]; - } - n++; - } - } - } - - // modified (Hockney-Eastwood) Coulomb Green's function + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = (MY_2PI/zprd_slab); const int nbx = static_cast ((g_ewald*xprd/(MY_PI*nx_pppm)) * pow(-log(EPS_HOC),0.25)); @@ -220,93 +118,188 @@ void PPPMCGOMP::setup() pow(-log(EPS_HOC),0.25)); const int nbz = static_cast ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * pow(-log(EPS_HOC),0.25)); + const int numk = nxhi_fft - nxlo_fft + 1; + const int numl = nyhi_fft - nylo_fft + 1; - const double form = 1.0; + const int twoorder = 2*order; #if defined(_OPENMP) #pragma omp parallel default(none) #endif { - int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m; - double snx,sny,snz,sqk; + double snx,sny,snz; double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; double sum1,dot1,dot2; double numerator,denominator; - const double gew2 = -4.0*g_ewald*g_ewald; + double sqk; - const int nnx = nxhi_fft-nxlo_fft+1; - const int nny = nyhi_fft-nylo_fft+1; + int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid; - loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads); + loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads); - for (m = nzlo_fft; m <= nzhi_fft; m++) { + for (n = nfrom; n < nto; ++n) { + m = n / (numl*numk); + l = (n - m*numl*numk) / numk; + k = n - m*numl*numk - l*numk; + m += nzlo_fft; + l += nylo_fft; + k += nxlo_fft; - const double fkzm = fkz[m]; - snz = sin(0.5*fkzm*zprd_slab/nz_pppm); - snz *= snz; + mper = m - nz_pppm*(2*m/nz_pppm); + snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm)); - for (l = nylo_fft; l <= nyhi_fft; l++) { - const double fkyl = fky[l]; - sny = sin(0.5*fkyl*yprd/ny_pppm); - sny *= sny; + lper = l - ny_pppm*(2*l/ny_pppm); + sny = square(sin(0.5*unitky*lper*yprd/ny_pppm)); - for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm)); - /* only compute the part designated to this thread */ - nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft)); - if ((nn < nnfrom) || (nn >=nnto)) continue; + 
sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); - const double fkxk = fkx[k]; - snx = sin(0.5*fkxk*xprd/nx_pppm); - snx *= snx; + if (sqk != 0.0) { + numerator = 12.5663706/sqk; + denominator = gf_denom(snx,sny,snz); + sum1 = 0.0; - sqk = fkxk*fkxk + fkyl*fkyl + fkzm*fkzm; + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); - if (sqk != 0.0) { - numerator = form*MY_4PI/sqk; - denominator = gf_denom(snx,sny,snz); - const double dorder = static_cast(order); - sum1 = 0.0; - for (nx = -nbx; nx <= nbx; nx++) { - qx = fkxk + unitkx*nx_pppm*nx; - sx = exp(qx*qx/gew2); - wx = 1.0; - argx = 0.5*qx*xprd/nx_pppm; - if (argx != 0.0) wx = pow(sin(argx)/argx,dorder); - wx *=wx; + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); - for (ny = -nby; ny <= nby; ny++) { - qy = fkyl + unitky*ny_pppm*ny; - sy = exp(qy*qy/gew2); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm; - if (argy != 0.0) wy = pow(sin(argy)/argy,dorder); - wy *= wy; + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); - for (nz = -nbz; nz <= nbz; nz++) { - qz = fkzm + unitkz*nz_pppm*nz; - sz = exp(qz*qz/gew2); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm; - if (argz != 0.0) wz = pow(sin(argz)/argz,dorder); - wz *= wz; - - dot1 = fkxk*qx + fkyl*qy + fkzm*qz; - dot2 = qx*qx+qy*qy+qz*qz; - sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; - } - } - } - greensfn[nn] = numerator*sum1/denominator; - } else greensfn[nn] = 0.0; - } - } + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; + } + } + } + greensfn[n] = numerator*sum1/denominator; + } else greensfn[n] = 0.0; } - } + } // end of parallel region } /* ---------------------------------------------------------------------- - run the regular toplevel compute method from plain PPPPMCG + compute optimized Green's function for energy calculation +------------------------------------------------------------------------- */ + +void PPPMCGOMP::compute_gf_ad() +{ + + const double * const prd = (triclinic==0) ? 
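/* ----------------------------------------------------------------------
   compute_gf_ik() above parallelizes the triple loop over the FFT grid
   by flattening it: loop_setup_thr() hands each thread a contiguous
   [nfrom,nto) slice of n = 0..nfft-1, and n is unpacked back into
   (k,l,m). the decomposition in isolation
------------------------------------------------------------------------- */

// invert n = k + numk*(l + numl*m) with 0 <= k < numk and 0 <= l < numl
static void unflatten(int n, int numk, int numl, int &k, int &l, int &m)
{
  m = n / (numl*numk);
  l = (n - m*numl*numk) / numk;
  k = n - m*numl*numk - l*numk;
}

// e.g. numk = 4, numl = 3: n = 23 unpacks to (k,l,m) = (3,2,1),
// and 3 + 4*(2 + 3*1) == 23 recovers the flat index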
domain->prd : domain->prd_lamda;
+
+ const double xprd = prd[0];
+ const double yprd = prd[1];
+ const double zprd = prd[2];
+ const double zprd_slab = zprd*slab_volfactor;
+ const double unitkx = (MY_2PI/xprd);
+ const double unitky = (MY_2PI/yprd);
+ const double unitkz = (MY_2PI/zprd_slab);
+
+ const int numk = nxhi_fft - nxlo_fft + 1;
+ const int numl = nyhi_fft - nylo_fft + 1;
+
+ const int twoorder = 2*order;
+ double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
+#endif
+ {
+ double snx,sny,snz,sqk;
+ double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+ double numerator,denominator;
+ int k,l,m,kper,lper,mper,n,nfrom,nto,tid;
+
+ loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
+
+ for (n = nfrom; n < nto; ++n) {
+
+ m = n / (numl*numk);
+ l = (n - m*numl*numk) / numk;
+ k = n - m*numl*numk - l*numk;
+ m += nzlo_fft;
+ l += nylo_fft;
+ k += nxlo_fft;
+
+ mper = m - nz_pppm*(2*m/nz_pppm);
+ qz = unitkz*mper;
+ snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
+ sz = exp(-0.25*square(qz/g_ewald));
+ argz = 0.5*qz*zprd_slab/nz_pppm;
+ wz = powsinxx(argz,twoorder);
+
+ lper = l - ny_pppm*(2*l/ny_pppm);
+ qy = unitky*lper;
+ sny = square(sin(0.5*qy*yprd/ny_pppm));
+ sy = exp(-0.25*square(qy/g_ewald));
+ argy = 0.5*qy*yprd/ny_pppm;
+ wy = powsinxx(argy,twoorder);
+
+ kper = k - nx_pppm*(2*k/nx_pppm);
+ qx = unitkx*kper;
+ snx = square(sin(0.5*qx*xprd/nx_pppm));
+ sx = exp(-0.25*square(qx/g_ewald));
+ argx = 0.5*qx*xprd/nx_pppm;
+ wx = powsinxx(argx,twoorder);
+
+ sqk = qx*qx + qy*qy + qz*qz;
+
+ if (sqk != 0.0) {
+ numerator = MY_4PI/sqk;
+ denominator = gf_denom(snx,sny,snz);
+ greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
+ sf0 += sf_precoeff1[n]*greensfn[n];
+ sf1 += sf_precoeff2[n]*greensfn[n];
+ sf2 += sf_precoeff3[n]*greensfn[n];
+ sf3 += sf_precoeff4[n]*greensfn[n];
+ sf4 += sf_precoeff5[n]*greensfn[n];
+ sf5 += sf_precoeff6[n]*greensfn[n];
+ } else {
+ greensfn[n] = 0.0;
+ sf0 += sf_precoeff1[n]*greensfn[n];
+ sf1 += sf_precoeff2[n]*greensfn[n];
+ sf2 += sf_precoeff3[n]*greensfn[n];
+ sf3 += sf_precoeff4[n]*greensfn[n];
+ sf4 += sf_precoeff5[n]*greensfn[n];
+ sf5 += sf_precoeff6[n]*greensfn[n];
+ }
+ }
+ } // end of parallel region
+
+ // compute the coefficients for the self-force correction
+
+ double prex, prey, prez, tmp[6];
+ prex = prey = prez = MY_PI/volume;
+ prex *= nx_pppm/xprd;
+ prey *= ny_pppm/yprd;
+ prez *= nz_pppm/zprd_slab;
+ tmp[0] = sf0 * prex;
+ tmp[1] = sf1 * prex*2;
+ tmp[2] = sf2 * prey;
+ tmp[3] = sf3 * prey*2;
+ tmp[4] = sf4 * prez;
+ tmp[5] = sf5 * prez*2;
+
+ // communicate values with other procs
+
+ MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
+}
+
+/* ----------------------------------------------------------------------
- run the regular toplevel compute method from plain PPPPMCG
+ run the regular toplevel compute method from plain PPPMCG
 which will have individual methods replaced by our threaded
 versions and then call the obligatory force reduction. 
------------------------------------------------------------------------- */ @@ -331,94 +324,10 @@ void PPPMCGOMP::compute(int eflag, int vflag) } /* ---------------------------------------------------------------------- - create discretized "density" on section of global grid due to my particles - density(x,y,z) = charge "density" at grid points of my 3d brick - (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts) - in global grid + interpolate from grid to get electric field & force on my particles for ik ------------------------------------------------------------------------- */ -void PPPMCGOMP::make_rho() -{ - const double * const q = atom->q; - const double * const * const x = atom->x; - const int nthreads = comm->nthreads; - -#if defined(_OPENMP) -#pragma omp parallel default(none) -#endif - { -#if defined(_OPENMP) - // each thread works on a fixed chunk of atoms. - const int tid = omp_get_thread_num(); - const int inum = num_charged; - const int idelta = 1 + inum/nthreads; - const int ifrom = tid*idelta; - const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta; -#else - const int tid = 0; - const int ifrom = 0; - const int ito = num_charged; -#endif - - int i,j,l,m,n,nx,ny,nz,mx,my,mz; - FFT_SCALAR dx,dy,dz,x0,y0,z0; - - // set up clear 3d density array - const int nzoffs = (nzhi_out-nzlo_out+1)*tid; - FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]); - memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR)); - - ThrData *thr = fix->get_thr(tid); - FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); - - // loop over my charges, add their contribution to nearby grid points - // (nx,ny,nz) = global coords of grid pt to "lower left" of charge - // (dx,dy,dz) = distance to "lower left" grid pt - // (mx,my,mz) = global coords of moving stencil pt - - // this if protects against having more threads than charged local atoms - if (ifrom < num_charged) { - for (j = ifrom; j < ito; j++) { - i = is_charged[j]; - - nx = part2grid[i][0]; - ny = part2grid[i][1]; - nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; - - compute_rho1d_thr(r1d,dx,dy,dz); - - z0 = delvolinv * q[i]; - for (n = nlower; n <= nupper; n++) { - mz = n+nz; - y0 = z0*r1d[2][n]; - for (m = nlower; m <= nupper; m++) { - my = m+ny; - x0 = y0*r1d[1][m]; - for (l = nlower; l <= nupper; l++) { - mx = l+nx; - db[mz][my][mx] += x0*r1d[0][l]; - } - } - } - } - } -#if defined(_OPENMP) - // reduce 3d density array - if (nthreads > 1) { - data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid); - } -#endif - } -} - -/* ---------------------------------------------------------------------- - interpolate from grid to get electric field & force on my particles -------------------------------------------------------------------------- */ - -void PPPMCGOMP::fieldforce() +void PPPMCGOMP::fieldforce_ik() { // loop over my charges, interpolate electric field from nearby grid points // (nx,ny,nz) = global coords of grid pt to "lower left" of charge @@ -426,79 +335,161 @@ void PPPMCGOMP::fieldforce() // (mx,my,mz) = global coords of moving stencil pt // ek = 3 components of E-field on particle - const double * const q = atom->q; - const double * const * const x = atom->x; - const int nthreads = comm->nthreads; + const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + const double * _noalias const q = atom->q; const double 
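/* ----------------------------------------------------------------------
   compute_gf_ad() above chains two reductions: an OpenMP
   reduction(+:...) clause sums the sf0..sf5 partials across threads,
   then MPI_Allreduce sums across ranks. the combined pattern in
   miniature (assumes an initialized MPI environment)
------------------------------------------------------------------------- */

#include <mpi.h>

double sum_grid_quantity(const double *val, int nfft, MPI_Comm world)
{
  double local = 0.0;
#if defined(_OPENMP)
#pragma omp parallel for reduction(+:local)   // per-thread partial sums
#endif
  for (int n = 0; n < nfft; ++n) local += val[n];

  double global = 0.0;                        // then combine across ranks
  MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, world);
  return global;
}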
qqrd2e = force->qqrd2e; + const int nthreads = comm->nthreads; #if defined(_OPENMP) #pragma omp parallel default(none) #endif { -#if defined(_OPENMP) - // each thread works on a fixed chunk of atoms. - const int tid = omp_get_thread_num(); - const int inum = num_charged; - const int idelta = 1 + inum/nthreads; - const int ifrom = tid*idelta; - const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta; -#else - const int ifrom = 0; - const int ito = num_charged; - const int tid = 0; -#endif + FFT_SCALAR dx,dy,dz,x0,y0,z0,ekx,eky,ekz; + int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz; + + loop_setup_thr(ifrom,ito,tid,num_charged,nthreads); + + // get per thread data ThrData *thr = fix->get_thr(tid); - double * const * const f = thr->get_f(); - FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); + dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0]; + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); - int i,j,l,m,n,nx,ny,nz,mx,my,mz; - FFT_SCALAR dx,dy,dz,x0,y0,z0; - FFT_SCALAR ekx,eky,ekz; + for (int j = ifrom; j < ito; ++j) { + i = is_charged[j]; - // this if protects against having more threads than charged local atoms - if (ifrom < num_charged) { - for (j = ifrom; j < ito; j++) { - i = is_charged[j]; + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv; - nx = part2grid[i][0]; - ny = part2grid[i][1]; - nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + compute_rho1d_thr(r1d,dx,dy,dz); - compute_rho1d_thr(r1d,dx,dy,dz); - - ekx = eky = ekz = ZEROF; - for (n = nlower; n <= nupper; n++) { - mz = n+nz; - z0 = r1d[2][n]; - for (m = nlower; m <= nupper; m++) { - my = m+ny; - y0 = z0*r1d[1][m]; - for (l = nlower; l <= nupper; l++) { - mx = l+nx; - x0 = y0*r1d[0][l]; - ekx -= x0*vdx_brick[mz][my][mx]; - eky -= x0*vdy_brick[mz][my][mx]; - ekz -= x0*vdz_brick[mz][my][mx]; - } - } - } - - // convert E-field to force - const double qfactor = qqrd2e*scale*q[i]; - f[i][0] += qfactor*ekx; - f[i][1] += qfactor*eky; - f[i][2] += qfactor*ekz; + ekx = eky = ekz = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + ekx -= x0*vdx_brick[mz][my][mx]; + eky -= x0*vdy_brick[mz][my][mx]; + ekz -= x0*vdz_brick[mz][my][mx]; + } + } } + + // convert E-field to force + + const double qfactor = qqrd2e * scale * q[i]; + f[i].x += qfactor*ekx; + f[i].y += qfactor*eky; + if (slabflag != 2) f[i].z += qfactor*ekz; } - } + } // end of parallel region } /* ---------------------------------------------------------------------- - interpolate from grid to get per-atom energy/virial - ------------------------------------------------------------------------- */ + interpolate from grid to get electric field & force on my particles for ad +------------------------------------------------------------------------- */ + +void PPPMCGOMP::fieldforce_ad() +{ + const double *prd = (triclinic == 0) ? 
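/* ----------------------------------------------------------------------
   fieldforce_ik() above recasts atom->x[0] to dbl3_t * (plus _noalias)
   so the compiler sees a flat array of xyz triples with no aliasing.
   the cast is only sound because LAMMPS stores x as one contiguous
   nlocal x 3 block; a sketch of the idea with a local struct
------------------------------------------------------------------------- */

struct dbl3 { double x, y, z; };

// xflat points at nlocal*3 contiguous doubles laid out x0,y0,z0,x1,y1,z1,...
static double sum_z(const double *xflat, int nlocal)
{
  const dbl3 *xs = (const dbl3 *) xflat;   // reinterpret row-major 2d storage
  double s = 0.0;
  for (int i = 0; i < nlocal; ++i) s += xs[i].z;
  return s;
}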
domain->prd : domain->prd_lamda;
+ const double hx_inv = nx_pppm/prd[0];
+ const double hy_inv = ny_pppm/prd[1];
+ const double hz_inv = nz_pppm/prd[2];
+
+ // loop over my charges, interpolate electric field from nearby grid points
+ // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+ // (dx,dy,dz) = distance to "lower left" grid pt
+ // (mx,my,mz) = global coords of moving stencil pt
+ // ek = 3 components of E-field on particle
+
+ const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+ const double * _noalias const q = atom->q;
+ const double qqrd2e = force->qqrd2e;
+ const int nthreads = comm->nthreads;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+ {
+ int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz;
+ FFT_SCALAR dx,dy,dz,ekx,eky,ekz;
+ double s1,s2,s3,sf;
+
+ loop_setup_thr(ifrom,ito,tid,num_charged,nthreads);
+
+ // get per thread data
+ ThrData *thr = fix->get_thr(tid);
+ dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+ FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+ FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());
+
+ for (int j = ifrom; j < ito; ++j) {
+ i = is_charged[j];
+
+ nx = part2grid[i][0];
+ ny = part2grid[i][1];
+ nz = part2grid[i][2];
+ dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv;
+ dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv;
+ dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv;
+
+ compute_rho1d_thr(r1d,dx,dy,dz);
+ compute_drho1d_thr(d1d,dx,dy,dz);
+
+ ekx = eky = ekz = ZEROF;
+ for (n = nlower; n <= nupper; n++) {
+ mz = n+nz;
+ for (m = nlower; m <= nupper; m++) {
+ my = m+ny;
+ for (l = nlower; l <= nupper; l++) {
+ mx = l+nx;
+ ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+ eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+ ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
+ }
+ }
+ }
+ ekx *= hx_inv;
+ eky *= hy_inv;
+ ekz *= hz_inv;
+
+ // convert E-field to force and subtract self forces
+
+ const double qi = q[i];
+ const double qfactor = qqrd2e * scale * qi;
+
+ s1 = x[i].x*hx_inv;
+ sf = sf_coeff[0]*sin(MY_2PI*s1);
+ sf += sf_coeff[1]*sin(MY_4PI*s1);
+ sf *= 2.0*qi;
+ f[i].x += qfactor*(ekx - sf);
+
+ s2 = x[i].y*hy_inv;
+ sf = sf_coeff[2]*sin(MY_2PI*s2);
+ sf += sf_coeff[3]*sin(MY_4PI*s2);
+ sf *= 2*qi;
+ f[i].y += qfactor*(eky - sf);
+
+ s3 = x[i].z*hz_inv;
+ sf = sf_coeff[4]*sin(MY_2PI*s3);
+ sf += sf_coeff[5]*sin(MY_4PI*s3);
+ sf *= 2*qi;
+ if (slabflag != 2) f[i].z += qfactor*(ekz - sf);
+ }
+ } // end of parallel region
+}
+
+/* ----------------------------------------------------------------------
+ interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */

 void PPPMCGOMP::fieldforce_peratom()
 {
@@ -507,82 +498,71 @@
 // (dx,dy,dz) = distance to "lower left" grid pt
 // (mx,my,mz) = global coords of moving stencil pt

- const double * const q = atom->q;
- const double * const * const x = atom->x;
+ const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+ const double * _noalias const q = atom->q;
 const int nthreads = comm->nthreads;

 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
 {
-#if defined(_OPENMP)
- // each thread works on a fixed chunk of atoms.
- const int tid = omp_get_thread_num();
- const int inum = num_charged;
- const int idelta = 1 + inum/nthreads;
- const int ifrom = tid*idelta;
- const int ito = ((ifrom + idelta) > 
inum : ifrom + idelta; -#else - const int ifrom = 0; - const int ito = num_charged; - const int tid = 0; -#endif - ThrData *thr = fix->get_thr(tid); - FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); - - int i,j,l,m,n,nx,ny,nz,mx,my,mz; FFT_SCALAR dx,dy,dz,x0,y0,z0; FFT_SCALAR u,v0,v1,v2,v3,v4,v5; + int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz; - // this if protects against having more threads than charged local atoms - if (ifrom < num_charged) { - for (j = ifrom; j < ito; j++) { - i = is_charged[j]; + loop_setup_thr(ifrom,ito,tid,num_charged,nthreads); - nx = part2grid[i][0]; - ny = part2grid[i][1]; - nz = part2grid[i][2]; - dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv; - dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv; - dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv; + // get per thread data + ThrData *thr = fix->get_thr(tid); + FFT_SCALAR * const * const r1d = static_cast(thr->get_rho1d()); - compute_rho1d_thr(r1d,dx,dy,dz); + for (int j=ifrom; j < ito; ++j) { + i = is_charged[j]; - u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; - for (n = nlower; n <= nupper; n++) { - mz = n+nz; - z0 = r1d[2][n]; - for (m = nlower; m <= nupper; m++) { - my = m+ny; - y0 = z0*r1d[1][m]; - for (l = nlower; l <= nupper; l++) { - mx = l+nx; - x0 = y0*r1d[0][l]; - if (eflag_atom) u += x0*u_brick[mz][my][mx]; - if (vflag_atom) { - v0 += x0*v0_brick[mz][my][mx]; - v1 += x0*v1_brick[mz][my][mx]; - v2 += x0*v2_brick[mz][my][mx]; - v3 += x0*v3_brick[mz][my][mx]; - v4 += x0*v4_brick[mz][my][mx]; - v5 += x0*v5_brick[mz][my][mx]; - } - } - } - } + nx = part2grid[i][0]; + ny = part2grid[i][1]; + nz = part2grid[i][2]; + dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv; + dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv; + dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv; - if (eflag_atom) eatom[i] += q[i]*u; - if (vflag_atom) { - vatom[i][0] += v0; - vatom[i][1] += v1; - vatom[i][2] += v2; - vatom[i][3] += v3; - vatom[i][4] += v4; - vatom[i][5] += v5; - } + compute_rho1d_thr(r1d,dx,dy,dz); + + u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF; + for (n = nlower; n <= nupper; n++) { + mz = n+nz; + z0 = r1d[2][n]; + for (m = nlower; m <= nupper; m++) { + my = m+ny; + y0 = z0*r1d[1][m]; + for (l = nlower; l <= nupper; l++) { + mx = l+nx; + x0 = y0*r1d[0][l]; + if (eflag_atom) u += x0*u_brick[mz][my][mx]; + if (vflag_atom) { + v0 += x0*v0_brick[mz][my][mx]; + v1 += x0*v1_brick[mz][my][mx]; + v2 += x0*v2_brick[mz][my][mx]; + v3 += x0*v3_brick[mz][my][mx]; + v4 += x0*v4_brick[mz][my][mx]; + v5 += x0*v5_brick[mz][my][mx]; + } + } + } + } + + const double qi = q[i]; + if (eflag_atom) eatom[i] += qi*u; + if (vflag_atom) { + vatom[i][0] += qi*v0; + vatom[i][1] += qi*v1; + vatom[i][2] += qi*v2; + vatom[i][3] += qi*v3; + vatom[i][4] += qi*v4; + vatom[i][5] += qi*v5; } } - } + } // end of parallel region } /* ---------------------------------------------------------------------- @@ -590,7 +570,7 @@ void PPPMCGOMP::fieldforce_peratom() dx,dy,dz = distance of particle from "lower left" grid point ------------------------------------------------------------------------- */ void PPPMCGOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx, - const FFT_SCALAR &dy, const FFT_SCALAR &dz) + const FFT_SCALAR &dy, const FFT_SCALAR &dz) { int k,l; FFT_SCALAR r1,r2,r3; @@ -608,3 +588,28 @@ void PPPMCGOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCAL r1d[2][k] = r3; } } + +/* ---------------------------------------------------------------------- + charge assignment into drho1d + dx,dy,dz = distance of particle from 
"lower left" grid point +------------------------------------------------------------------------- */ + +void PPPMCGOMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx, + const FFT_SCALAR &dy, const FFT_SCALAR &dz) +{ + int k,l; + FFT_SCALAR r1,r2,r3; + + for (k = (1-order)/2; k <= order/2; k++) { + r1 = r2 = r3 = ZEROF; + + for (l = order-2; l >= 0; l--) { + r1 = drho_coeff[l][k] + r1*dx; + r2 = drho_coeff[l][k] + r2*dy; + r3 = drho_coeff[l][k] + r3*dz; + } + d1d[0][k] = r1; + d1d[1][k] = r2; + d1d[2][k] = r3; + } +} diff --git a/src/USER-OMP/pppm_cg_omp.h b/src/USER-OMP/pppm_cg_omp.h index 10a9d35f29..37be55eeef 100644 --- a/src/USER-OMP/pppm_cg_omp.h +++ b/src/USER-OMP/pppm_cg_omp.h @@ -25,23 +25,29 @@ KSpaceStyle(pppm/cg/omp,PPPMCGOMP) namespace LAMMPS_NS { - class PPPMCGOMP : public PPPMCG, public ThrOMP { +class PPPMCGOMP : public PPPMCG, public ThrOMP { public: PPPMCGOMP(class LAMMPS *, int, char **); - virtual void compute(int, int); - virtual void setup(); virtual ~PPPMCGOMP () {}; + virtual void compute(int, int); protected: virtual void allocate(); virtual void deallocate(); - virtual void fieldforce(); - virtual void fieldforce_peratom(); - virtual void make_rho(); + virtual void compute_gf_ik(); + virtual void compute_gf_ad(); + + virtual void fieldforce_ik(); + virtual void fieldforce_ad(); + virtual void fieldforce_peratom(); +// virtual void make_rho(); + + private: void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &, const FFT_SCALAR &, const FFT_SCALAR &); -// void compute_rho_coeff(); + void compute_drho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &, + const FFT_SCALAR &, const FFT_SCALAR &); // void slabcorr(int); }; diff --git a/src/USER-OMP/pppm_omp.cpp b/src/USER-OMP/pppm_omp.cpp index 5c400b6f48..d45cc492ed 100644 --- a/src/USER-OMP/pppm_omp.cpp +++ b/src/USER-OMP/pppm_omp.cpp @@ -19,9 +19,12 @@ #include "atom.h" #include "comm.h" #include "domain.h" +#include "error.h" +#include "fix_omp.h" #include "force.h" #include "memory.h" #include "math_const.h" +#include "math_special.h" #include #include @@ -29,13 +32,12 @@ #include "suffix.h" using namespace LAMMPS_NS; using namespace MathConst; +using namespace MathSpecial; #ifdef FFT_SINGLE #define ZEROF 0.0f -#define ONEF 1.0f #else #define ZEROF 0.0 -#define ONEF 1.0 #endif #define EPS_HOC 1.0e-7 @@ -56,7 +58,27 @@ void PPPMOMP::allocate() { PPPM::allocate(); - const int nthreads = comm->nthreads; +#if defined(_OPENMP) +#pragma omp parallel default(none) +#endif + { +#if defined(_OPENMP) + const int tid = omp_get_thread_num(); +#else + const int tid = 0; +#endif + ThrData *thr = fix->get_thr(tid); + thr->init_pppm(order,memory); + } +} + +/* ---------------------------------------------------------------------- + free memory that depends on # of K-vectors and order +------------------------------------------------------------------------- */ + +void PPPMOMP::deallocate() +{ + PPPM::deallocate(); #if defined(_OPENMP) #pragma omp parallel default(none) @@ -67,153 +89,26 @@ void PPPMOMP::allocate() #else const int tid = 0; #endif - - FFT_SCALAR **rho1d_thr; - memory->create2d_offset(rho1d_thr,3,-order/2,order/2,"pppm:rho1d_thr"); ThrData *thr = fix->get_thr(tid); - thr->init_pppm(static_cast(rho1d_thr)); - } - - const int nzend = (nzhi_out-nzlo_out+1)*nthreads + nzlo_out -1; - - // reallocate density brick, so it fits our needs - memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out); - memory->create3d_offset(density_brick,nzlo_out,nzend,nylo_out,nyhi_out, - 
nxlo_out,nxhi_out,"pppm:density_brick"); -} - -// NOTE: special version of reduce_data for FFT_SCALAR data type. -// reduce per thread data into the first part of the data -// array that is used for the non-threaded parts and reset -// the temporary storage to 0.0. this routine depends on -// multi-dimensional arrays like force stored in this order -// x1,y1,z1,x2,y2,z2,... -// we need to post a barrier to wait until all threads are done -// writing to the array. -static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid) -{ -#if defined(_OPENMP) - // NOOP in non-threaded execution. - if (nthreads == 1) return; -#pragma omp barrier - { - const int nvals = ndim*nall; - const int idelta = nvals/nthreads + 1; - const int ifrom = tid*idelta; - const int ito = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta); - - // this if protects against having more threads than atoms - if (ifrom < nall) { - for (int m = ifrom; m < ito; ++m) { - for (int n = 1; n < nthreads; ++n) { - dall[m] += dall[n*nvals + m]; - dall[n*nvals + m] = 0.0; - } - } - } - } -#else - // NOOP in non-threaded execution. - return; -#endif -} - -/* ---------------------------------------------------------------------- - free memory that depends on # of K-vectors and order -------------------------------------------------------------------------- */ - -void PPPMOMP::deallocate() -{ - PPPM::deallocate(); - for (int i=0; i < comm->nthreads; ++i) { - ThrData * thr = fix->get_thr(i); - FFT_SCALAR ** rho1d_thr = static_cast(thr->get_rho1d()); - memory->destroy2d_offset(rho1d_thr,-order/2); + thr->init_pppm(-order,memory); } } /* ---------------------------------------------------------------------- - adjust PPPM coeffs, called initially and whenever volume has changed + pre-compute modified (Hockney-Eastwood) Coulomb Green's function ------------------------------------------------------------------------- */ -void PPPMOMP::setup() +void PPPMOMP::compute_gf_ik() { - int i,j,k,n; - double *prd; - - // volume-dependent factors - // adjust z dimension for 2d slab PPPM - // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0 - - if (triclinic == 0) prd = domain->prd; - else prd = domain->prd_lamda; + const double * const prd = (triclinic==0) ? 
domain->prd : domain->prd_lamda; const double xprd = prd[0]; const double yprd = prd[1]; const double zprd = prd[2]; const double zprd_slab = zprd*slab_volfactor; - volume = xprd * yprd * zprd_slab; - - delxinv = nx_pppm/xprd; - delyinv = ny_pppm/yprd; - delzinv = nz_pppm/zprd_slab; - - delvolinv = delxinv*delyinv*delzinv; - - const double unitkx = (2.0*MY_PI/xprd); - const double unitky = (2.0*MY_PI/yprd); - const double unitkz = (2.0*MY_PI/zprd_slab); - - // fkx,fky,fkz for my FFT grid pts - - double per; - - for (i = nxlo_fft; i <= nxhi_fft; i++) { - per = i - nx_pppm*(2*i/nx_pppm); - fkx[i] = unitkx*per; - } - - for (i = nylo_fft; i <= nyhi_fft; i++) { - per = i - ny_pppm*(2*i/ny_pppm); - fky[i] = unitky*per; - } - - for (i = nzlo_fft; i <= nzhi_fft; i++) { - per = i - nz_pppm*(2*i/nz_pppm); - fkz[i] = unitkz*per; - } - - // virial coefficients - - double sqk,vterm; - - n = 0; - for (k = nzlo_fft; k <= nzhi_fft; k++) { - for (j = nylo_fft; j <= nyhi_fft; j++) { - for (i = nxlo_fft; i <= nxhi_fft; i++) { - sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k]; - if (sqk == 0.0) { - vg[n][0] = 0.0; - vg[n][1] = 0.0; - vg[n][2] = 0.0; - vg[n][3] = 0.0; - vg[n][4] = 0.0; - vg[n][5] = 0.0; - } else { - vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald)); - vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i]; - vg[n][1] = 1.0 + vterm*fky[j]*fky[j]; - vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k]; - vg[n][3] = vterm*fkx[i]*fky[j]; - vg[n][4] = vterm*fkx[i]*fkz[k]; - vg[n][5] = vterm*fky[j]*fkz[k]; - } - n++; - } - } - } - - // modified (Hockney-Eastwood) Coulomb Green's function + const double unitkx = (MY_2PI/xprd); + const double unitky = (MY_2PI/yprd); + const double unitkz = (MY_2PI/zprd_slab); const int nbx = static_cast ((g_ewald*xprd/(MY_PI*nx_pppm)) * pow(-log(EPS_HOC),0.25)); @@ -221,93 +116,188 @@ void PPPMOMP::setup() pow(-log(EPS_HOC),0.25)); const int nbz = static_cast ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * pow(-log(EPS_HOC),0.25)); + const int numk = nxhi_fft - nxlo_fft + 1; + const int numl = nyhi_fft - nylo_fft + 1; - const double form = 1.0; + const int twoorder = 2*order; #if defined(_OPENMP) #pragma omp parallel default(none) #endif { - int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m; - double snx,sny,snz,sqk; + double snx,sny,snz; double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz; double sum1,dot1,dot2; double numerator,denominator; - const double gew2 = -4.0*g_ewald*g_ewald; + double sqk; - const int nnx = nxhi_fft-nxlo_fft+1; - const int nny = nyhi_fft-nylo_fft+1; + int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid; - loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads); + loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads); - for (m = nzlo_fft; m <= nzhi_fft; m++) { + for (n = nfrom; n < nto; ++n) { + m = n / (numl*numk); + l = (n - m*numl*numk) / numk; + k = n - m*numl*numk - l*numk; + m += nzlo_fft; + l += nylo_fft; + k += nxlo_fft; - const double fkzm = fkz[m]; - snz = sin(0.5*fkzm*zprd_slab/nz_pppm); - snz *= snz; + mper = m - nz_pppm*(2*m/nz_pppm); + snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm)); - for (l = nylo_fft; l <= nyhi_fft; l++) { - const double fkyl = fky[l]; - sny = sin(0.5*fkyl*yprd/ny_pppm); - sny *= sny; + lper = l - ny_pppm*(2*l/ny_pppm); + sny = square(sin(0.5*unitky*lper*yprd/ny_pppm)); - for (k = nxlo_fft; k <= nxhi_fft; k++) { + kper = k - nx_pppm*(2*k/nx_pppm); + snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm)); - /* only compute the part designated to this thread */ - nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft)); - if ((nn < nnfrom) || (nn >=nnto)) continue; + sqk 
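/* ----------------------------------------------------------------------
   the Green's function hunks above call MathSpecial::powsinxx(x,n) for
   the (sin(x)/x)^n interpolation weight. assuming those semantics (the
   x -> 0 limit of sin(x)/x is 1), an equivalent unoptimized sketch
------------------------------------------------------------------------- */

#include <cmath>

// (sin(x)/x)^n with the removable singularity at x = 0 handled explicitly
static double powsinxx_sketch(double x, int n)
{
  if (x == 0.0) return 1.0;
  return std::pow(std::sin(x)/x, (double) n);
}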
= square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper); - const double fkxk = fkx[k]; - snx = sin(0.5*fkxk*xprd/nx_pppm); - snx *= snx; + if (sqk != 0.0) { + numerator = 12.5663706/sqk; + denominator = gf_denom(snx,sny,snz); + sum1 = 0.0; - sqk = fkxk*fkxk + fkyl*fkyl + fkzm*fkzm; + for (nx = -nbx; nx <= nbx; nx++) { + qx = unitkx*(kper+nx_pppm*nx); + sx = exp(-0.25*square(qx/g_ewald)); + argx = 0.5*qx*xprd/nx_pppm; + wx = powsinxx(argx,twoorder); - if (sqk != 0.0) { - numerator = form*MY_4PI/sqk; - denominator = gf_denom(snx,sny,snz); - sum1 = 0.0; - const double dorder = static_cast(order); - for (nx = -nbx; nx <= nbx; nx++) { - qx = fkxk + unitkx*nx_pppm*nx; - sx = exp(qx*qx/gew2); - wx = 1.0; - argx = 0.5*qx*xprd/nx_pppm; - if (argx != 0.0) wx = pow(sin(argx)/argx,dorder); - wx *=wx; + for (ny = -nby; ny <= nby; ny++) { + qy = unitky*(lper+ny_pppm*ny); + sy = exp(-0.25*square(qy/g_ewald)); + argy = 0.5*qy*yprd/ny_pppm; + wy = powsinxx(argy,twoorder); - for (ny = -nby; ny <= nby; ny++) { - qy = fkyl + unitky*ny_pppm*ny; - sy = exp(qy*qy/gew2); - wy = 1.0; - argy = 0.5*qy*yprd/ny_pppm; - if (argy != 0.0) wy = pow(sin(argy)/argy,dorder); - wy *= wy; + for (nz = -nbz; nz <= nbz; nz++) { + qz = unitkz*(mper+nz_pppm*nz); + sz = exp(-0.25*square(qz/g_ewald)); + argz = 0.5*qz*zprd_slab/nz_pppm; + wz = powsinxx(argz,twoorder); - for (nz = -nbz; nz <= nbz; nz++) { - qz = fkzm + unitkz*nz_pppm*nz; - sz = exp(qz*qz/gew2); - wz = 1.0; - argz = 0.5*qz*zprd_slab/nz_pppm; - if (argz != 0.0) wz = pow(sin(argz)/argz,dorder); - wz *= wz; - - dot1 = fkxk*qx + fkyl*qy + fkzm*qz; - dot2 = qx*qx+qy*qy+qz*qz; - sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; - } - } + dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz; + dot2 = qx*qx+qy*qy+qz*qz; + sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz; } - greensfn[nn] = numerator*sum1/denominator; - } else greensfn[nn] = 0.0; + } } - } + greensfn[n] = numerator*sum1/denominator; + } else greensfn[n] = 0.0; } - } + } // end of parallel region } /* ---------------------------------------------------------------------- - run the regular toplevel compute method from plain PPPPM + compute optimized Green's function for energy calculation +------------------------------------------------------------------------- */ + +void PPPMOMP::compute_gf_ad() +{ + + const double * const prd = (triclinic==0) ? 
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);
+
+  const int numk = nxhi_fft - nxlo_fft + 1;
+  const int numl = nyhi_fft - nylo_fft + 1;
+
+  const int twoorder = 2*order;
+  double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
+#endif
+  {
+    double snx,sny,snz,sqk;
+    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+    double numerator,denominator;
+    int k,l,m,kper,lper,mper,n,nfrom,nto,tid;
+
+    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
+
+    for (n = nfrom; n < nto; ++n) {
+
+      m = n / (numl*numk);
+      l = (n - m*numl*numk) / numk;
+      k = n - m*numl*numk - l*numk;
+      m += nzlo_fft;
+      l += nylo_fft;
+      k += nxlo_fft;
+
+      mper = m - nz_pppm*(2*m/nz_pppm);
+      qz = unitkz*mper;
+      snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
+      sz = exp(-0.25*square(qz/g_ewald));
+      argz = 0.5*qz*zprd_slab/nz_pppm;
+      wz = powsinxx(argz,twoorder);
+
+      lper = l - ny_pppm*(2*l/ny_pppm);
+      qy = unitky*lper;
+      sny = square(sin(0.5*qy*yprd/ny_pppm));
+      sy = exp(-0.25*square(qy/g_ewald));
+      argy = 0.5*qy*yprd/ny_pppm;
+      wy = powsinxx(argy,twoorder);
+
+      kper = k - nx_pppm*(2*k/nx_pppm);
+      qx = unitkx*kper;
+      snx = square(sin(0.5*qx*xprd/nx_pppm));
+      sx = exp(-0.25*square(qx/g_ewald));
+      argx = 0.5*qx*xprd/nx_pppm;
+      wx = powsinxx(argx,twoorder);
+
+      sqk = qx*qx + qy*qy + qz*qz;
+
+      if (sqk != 0.0) {
+        numerator = MY_4PI/sqk;
+        denominator = gf_denom(snx,sny,snz);
+        greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
+        sf0 += sf_precoeff1[n]*greensfn[n];
+        sf1 += sf_precoeff2[n]*greensfn[n];
+        sf2 += sf_precoeff3[n]*greensfn[n];
+        sf3 += sf_precoeff4[n]*greensfn[n];
+        sf4 += sf_precoeff5[n]*greensfn[n];
+        sf5 += sf_precoeff6[n]*greensfn[n];
+      } else {
+        greensfn[n] = 0.0;
+        sf0 += sf_precoeff1[n]*greensfn[n];
+        sf1 += sf_precoeff2[n]*greensfn[n];
+        sf2 += sf_precoeff3[n]*greensfn[n];
+        sf3 += sf_precoeff4[n]*greensfn[n];
+        sf4 += sf_precoeff5[n]*greensfn[n];
+        sf5 += sf_precoeff6[n]*greensfn[n];
+      }
+    }
+  } // end of parallel region
+
+  // compute the coefficients for the self-force correction
+
+  double prex, prey, prez, tmp[6];
+  prex = prey = prez = MY_PI/volume;
+  prex *= nx_pppm/xprd;
+  prey *= ny_pppm/yprd;
+  prez *= nz_pppm/zprd_slab;
+  tmp[0] = sf0 * prex;
+  tmp[1] = sf1 * prex*2;
+  tmp[2] = sf2 * prey;
+  tmp[3] = sf3 * prey*2;
+  tmp[4] = sf4 * prez;
+  tmp[5] = sf5 * prez*2;
+
+  // communicate values with other procs
+
+  MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
+}
+
+/* ----------------------------------------------------------------------
+   run the regular toplevel compute method from plain PPPM
    which will have individual methods replaced by our threaded
    versions and then call the obligatory force reduction.
------------------------------------------------------------------------- */
@@ -332,94 +322,10 @@ void PPPMOMP::compute(int eflag, int vflag)
 }

 /* ----------------------------------------------------------------------
-   create discretized "density" on section of global grid due to my particles
-   density(x,y,z) = charge "density" at grid points of my 3d brick
-   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
-   in global grid
+   interpolate from grid to get electric field & force on my particles for ik
 ------------------------------------------------------------------------- */

-void PPPMOMP::make_rho()
-{
-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
-  const int nthreads = comm->nthreads;
-  const int nlocal = atom->nlocal;
-
-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = nlocal;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int tid = 0;
-    const int ifrom = 0;
-    const int ito = nlocal;
-#endif
-
-    int l,m,n,nx,ny,nz,mx,my,mz;
-    FFT_SCALAR dx,dy,dz,x0,y0,z0;
-
-    // set up clear 3d density array
-    const int nzoffs = (nzhi_out-nzlo_out+1)*tid;
-    FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]);
-    memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR));
-
-    ThrData *thr = fix->get_thr(tid);
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
-
-    // loop over my charges, add their contribution to nearby grid points
-    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-    // (dx,dy,dz) = distance to "lower left" grid pt
-    // (mx,my,mz) = global coords of moving stencil pt
-
-    // this if protects against having more threads than local atoms
-    if (ifrom < nlocal) {
-      for (int i = ifrom; i < ito; i++) {
-
-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-        compute_rho1d_thr(r1d,dx,dy,dz);
-
-        z0 = delvolinv * q[i];
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          y0 = z0*r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            x0 = y0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              db[mz][my][mx] += x0*r1d[0][l];
-            }
-          }
-        }
-      }
-    }
-#if defined(_OPENMP)
-    // reduce 3d density array
-    if (nthreads > 1) {
-      data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
-    }
-#endif
-  }
-}
-
-/* ----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
-------------------------------------------------------------------------- */
-
-void PPPMOMP::fieldforce()
+void PPPMOMP::fieldforce_ik()
 {
   // loop over my charges, interpolate electric field from nearby grid points
@@ -427,79 +333,170 @@ void PPPMOMP::fieldforce()
   // (mx,my,mz) = global coords of moving stencil pt
   // ek = 3 components of E-field on particle

-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;
+  const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+
+  const double qqrd2e = force->qqrd2e;
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];
+
+  const int nthreads = comm->nthreads;
   const int nlocal = atom->nlocal;
-  const double qqrd2e = force->qqrd2e;

 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = nlocal;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int ifrom = 0;
-    const int ito = nlocal;
-    const int tid = 0;
-#endif
+    FFT_SCALAR x0,y0,z0,ekx,eky,ekz;
+    int i,ifrom,ito,tid,l,m,n,mx,my,mz;
+
+    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
+
+    // get per thread data
     ThrData *thr = fix->get_thr(tid);
-    double * const * const f = thr->get_f();
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

-    int l,m,n,nx,ny,nz,mx,my,mz;
-    FFT_SCALAR dx,dy,dz,x0,y0,z0;
-    FFT_SCALAR ekx,eky,ekz;
+    for (i = ifrom; i < ito; ++i) {
+      const int nx = p2g[i].a;
+      const int ny = p2g[i].b;
+      const int nz = p2g[i].t;
+      const FFT_SCALAR dx = nx+shiftone - (x[i].x-boxlox)*delxinv;
+      const FFT_SCALAR dy = ny+shiftone - (x[i].y-boxloy)*delyinv;
+      const FFT_SCALAR dz = nz+shiftone - (x[i].z-boxloz)*delzinv;

-    // this if protects against having more threads than local atoms
-    if (ifrom < nlocal) {
-      for (int i = ifrom; i < ito; i++) {
+      compute_rho1d_thr(r1d,dx,dy,dz);

-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
-
-        compute_rho1d_thr(r1d,dx,dy,dz);
-
-        ekx = eky = ekz = ZEROF;
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          z0 = r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            y0 = z0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              x0 = y0*r1d[0][l];
-              ekx -= x0*vdx_brick[mz][my][mx];
-              eky -= x0*vdy_brick[mz][my][mx];
-              ekz -= x0*vdz_brick[mz][my][mx];
-            }
+      ekx = eky = ekz = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        z0 = r1d[2][n];
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          y0 = z0*r1d[1][m];
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            x0 = y0*r1d[0][l];
+            ekx -= x0*vdx_brick[mz][my][mx];
+            eky -= x0*vdy_brick[mz][my][mx];
+            ekz -= x0*vdz_brick[mz][my][mx];
           }
         }
-
-        // convert E-field to force
-        const double qfactor = qqrd2e*scale*q[i];
-        f[i][0] += qfactor*ekx;
-        f[i][1] += qfactor*eky;
-        f[i][2] += qfactor*ekz;
       }
+
+      // convert E-field to force
+
+      const double qfactor = qqrd2e * scale * q[i];
+      f[i].x += qfactor*ekx;
+      f[i].y += qfactor*eky;
+      if (slabflag != 2) f[i].z += qfactor*ekz;
     }
-  }
+  } // end of parallel region
 }

 /* ----------------------------------------------------------------------
-   interpolate from grid to get per-atom energy/virial
- ------------------------------------------------------------------------- */
+   interpolate from grid to get electric field & force on my particles for ad
+------------------------------------------------------------------------- */
+
+void PPPMOMP::fieldforce_ad()
+{
+  const double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda;
+  const double hx_inv = nx_pppm/prd[0];
+  const double hy_inv = ny_pppm/prd[1];
+  const double hz_inv = nz_pppm/prd[2];
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;
+  const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+  const double qqrd2e = force->qqrd2e;
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];
+
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    double s1,s2,s3,sf;
+    FFT_SCALAR ekx,eky,ekz;
+    int i,ifrom,ito,tid,l,m,n,mx,my,mz;
+
+    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
+
+    // get per thread data
+    ThrData *thr = fix->get_thr(tid);
+    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+    FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());
+
+    for (i = ifrom; i < ito; ++i) {
+      const int nx = p2g[i].a;
+      const int ny = p2g[i].b;
+      const int nz = p2g[i].t;
+      const FFT_SCALAR dx = nx+shiftone - (x[i].x-boxlox)*delxinv;
+      const FFT_SCALAR dy = ny+shiftone - (x[i].y-boxloy)*delyinv;
+      const FFT_SCALAR dz = nz+shiftone - (x[i].z-boxloz)*delzinv;
+
+      compute_rho1d_thr(r1d,dx,dy,dz);
+      compute_drho1d_thr(d1d,dx,dy,dz);
+
+      ekx = eky = ekz = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+            eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+            ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
+          }
+        }
+      }
+      ekx *= hx_inv;
+      eky *= hy_inv;
+      ekz *= hz_inv;
+
+      // convert E-field to force and subtract self forces
+
+      const double qi = q[i];
+      const double qfactor = qqrd2e * scale * qi;
+
+      s1 = x[i].x*hx_inv;
+      sf = sf_coeff[0]*sin(MY_2PI*s1);
+      sf += sf_coeff[1]*sin(MY_4PI*s1);
+      sf *= 2.0*qi;
+      f[i].x += qfactor*(ekx - sf);
+
+      s2 = x[i].y*hy_inv;
+      sf = sf_coeff[2]*sin(MY_2PI*s2);
+      sf += sf_coeff[3]*sin(MY_4PI*s2);
+      sf *= 2.0*qi;
+      f[i].y += qfactor*(eky - sf);
+
+      s3 = x[i].z*hz_inv;
+      sf = sf_coeff[4]*sin(MY_2PI*s3);
+      sf += sf_coeff[5]*sin(MY_4PI*s3);
+      sf *= 2.0*qi;
+      if (slabflag != 2) f[i].z += qfactor*(ekz - sf);
+    }
+  } // end of parallel region
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */

 void PPPMOMP::fieldforce_peratom()
 {
@@ -508,8 +505,8 @@ void PPPMOMP::fieldforce_peratom()
   // (dx,dy,dz) = distance to "lower left" grid pt
   // (mx,my,mz) = global coords of moving stencil pt

-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;

   const int nthreads = comm->nthreads;
   const int nlocal = atom->nlocal;
@@ -517,73 +514,61 @@ void PPPMOMP::fieldforce_peratom()
 #pragma omp parallel default(none)
 #endif
   {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = nlocal;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int ifrom = 0;
-    const int ito = nlocal;
-    const int tid = 0;
-#endif
-    ThrData *thr = fix->get_thr(tid);
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
-
-    int i,l,m,n,nx,ny,nz,mx,my,mz;
     FFT_SCALAR dx,dy,dz,x0,y0,z0;
     FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+    int i,ifrom,ito,tid,l,m,n,nx,ny,nz,mx,my,mz;

-    // this if protects against having more threads than local atoms
-    if (ifrom < nlocal) {
-      for (int i = ifrom; i < ito; i++) {
+    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);

-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+    // get per thread data
+    ThrData *thr = fix->get_thr(tid);
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

-        compute_rho1d_thr(r1d,dx,dy,dz);
+    for (i = ifrom; i < ito; ++i) {
+      nx = part2grid[i][0];
+      ny = part2grid[i][1];
+      nz = part2grid[i][2];
+      dx = nx+shiftone - (x[i].x-boxlo[0])*delxinv;
+      dy = ny+shiftone - (x[i].y-boxlo[1])*delyinv;
+      dz = nz+shiftone - (x[i].z-boxlo[2])*delzinv;

-        u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          z0 = r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            y0 = z0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              x0 = y0*r1d[0][l];
-              if (eflag_atom) u += x0*u_brick[mz][my][mx];
-              if (vflag_atom) {
-                v0 += x0*v0_brick[mz][my][mx];
-                v1 += x0*v1_brick[mz][my][mx];
-                v2 += x0*v2_brick[mz][my][mx];
-                v3 += x0*v3_brick[mz][my][mx];
-                v4 += x0*v4_brick[mz][my][mx];
-                v5 += x0*v5_brick[mz][my][mx];
-              }
+      compute_rho1d_thr(r1d,dx,dy,dz);
+
+      u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        z0 = r1d[2][n];
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          y0 = z0*r1d[1][m];
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            x0 = y0*r1d[0][l];
+            if (eflag_atom) u += x0*u_brick[mz][my][mx];
+            if (vflag_atom) {
+              v0 += x0*v0_brick[mz][my][mx];
+              v1 += x0*v1_brick[mz][my][mx];
+              v2 += x0*v2_brick[mz][my][mx];
+              v3 += x0*v3_brick[mz][my][mx];
+              v4 += x0*v4_brick[mz][my][mx];
+              v5 += x0*v5_brick[mz][my][mx];
            }
          }
        }
+      }

-        if (eflag_atom) eatom[i] += q[i]*u;
-        if (vflag_atom) {
-          vatom[i][0] += v0;
-          vatom[i][1] += v1;
-          vatom[i][2] += v2;
-          vatom[i][3] += v3;
-          vatom[i][4] += v4;
-          vatom[i][5] += v5;
-        }
+      const double qi = q[i];
+      if (eflag_atom) eatom[i] += qi*u;
+      if (vflag_atom) {
+        vatom[i][0] += qi*v0;
+        vatom[i][1] += qi*v1;
+        vatom[i][2] += qi*v2;
+        vatom[i][3] += qi*v3;
+        vatom[i][4] += qi*v4;
+        vatom[i][5] += qi*v5;
      }
    }
-  }
+  } // end of parallel region
 }

 /* ----------------------------------------------------------------------
@@ -609,3 +594,28 @@ void PPPMOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR
     r1d[2][k] = r3;
   }
 }
+
+/* ----------------------------------------------------------------------
+   charge assignment into drho1d
+   dx,dy,dz = distance of particle from "lower left" grid point
+------------------------------------------------------------------------- */
+
+void PPPMOMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx,
+                                 const FFT_SCALAR &dy, const FFT_SCALAR &dz)
+{
+  int k,l;
+  FFT_SCALAR r1,r2,r3;
+
+  for (k = (1-order)/2; k <= order/2; k++) {
+    r1 = r2 = r3 = ZEROF;
+
+    for (l = order-2; l >= 0; l--) {
+      r1 = drho_coeff[l][k] + r1*dx;
+      r2 = drho_coeff[l][k] + r2*dy;
+      r3 = drho_coeff[l][k] + r3*dz;
+    }
+    d1d[0][k] = r1;
+    d1d[1][k] = r2;
+    d1d[2][k] = r3;
+  }
+}
diff --git a/src/USER-OMP/pppm_omp.h b/src/USER-OMP/pppm_omp.h
index b3c069844c..835398b79f 100644
--- a/src/USER-OMP/pppm_omp.h
+++ b/src/USER-OMP/pppm_omp.h
@@ -25,23 +25,29 @@ KSpaceStyle(pppm/omp,PPPMOMP)

 namespace LAMMPS_NS {

-  class PPPMOMP : public PPPM, public ThrOMP {
+class PPPMOMP : public PPPM, public ThrOMP {

  public:
   PPPMOMP(class LAMMPS *, int, char **);
   virtual ~PPPMOMP () {};
-  virtual void setup();
   virtual void compute(int, int);

  protected:
   virtual void allocate();
   virtual void deallocate();
-  virtual void fieldforce();
-  virtual void fieldforce_peratom();
-  virtual void make_rho();
+
+  virtual void compute_gf_ik();
+  virtual void compute_gf_ad();
+
+  virtual void fieldforce_ik();
+  virtual void fieldforce_ad();
+  virtual void fieldforce_peratom();
+//  virtual void make_rho();
+
+ private:
   void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
                          const FFT_SCALAR &, const FFT_SCALAR &);
-//  void compute_rho_coeff();
+  void compute_drho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
+                          const FFT_SCALAR &, const FFT_SCALAR &);
//  void slabcorr(int);

 };
diff --git a/src/USER-OMP/pppm_tip4p_omp.cpp b/src/USER-OMP/pppm_tip4p_omp.cpp
index e7c6cbe106..92c8098f1c 100644
--- a/src/USER-OMP/pppm_tip4p_omp.cpp
+++ b/src/USER-OMP/pppm_tip4p_omp.cpp
@@ -19,9 +19,12 @@
 #include "atom.h"
 #include "comm.h"
 #include "domain.h"
+#include "error.h"
+#include "fix_omp.h"
 #include "force.h"
 #include "memory.h"
 #include "math_const.h"
+#include "math_special.h"

 #include <string.h>
 #include <math.h>
@@ -29,74 +32,294 @@
 #include "suffix.h"

 using namespace LAMMPS_NS;
 using namespace MathConst;
-
-#define OFFSET 16384
+using namespace MathSpecial;

 #ifdef FFT_SINGLE
 #define ZEROF 0.0f
-#define ONEF  1.0f
 #else
 #define ZEROF 0.0
-#define ONEF  1.0
 #endif

+#define EPS_HOC 1.0e-7
+#define OFFSET 16384
+
 /* ---------------------------------------------------------------------- */

 PPPMTIP4POMP::PPPMTIP4POMP(LAMMPS *lmp, int narg, char **arg) :
-  PPPMOMP(lmp, narg, arg)
+  PPPMTIP4P(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE)
 {
-  tip4pflag = 1;
   suffix_flag |= Suffix::OMP;
 }

-/* ---------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------
+   allocate memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */

-// NOTE: special version of reduce_data for FFT_SCALAR data type.
-//       reduce per thread data into the first part of the data
-//       array that is used for the non-threaded parts and reset
-//       the temporary storage to 0.0. this routine depends on
-//       multi-dimensional arrays like force stored in this order
-//       x1,y1,z1,x2,y2,z2,...
-//       we need to post a barrier to wait until all threads are done
-//       writing to the array.
-static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads, int ndim, int tid)
+void PPPMTIP4POMP::allocate()
 {
-#if defined(_OPENMP)
-  // NOOP in non-threaded execution.
-  if (nthreads == 1) return;
-#pragma omp barrier
-  {
-    const int nvals = ndim*nall;
-    const int idelta = nvals/nthreads + 1;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);
+  PPPMTIP4P::allocate();

-    // this if protects against having more threads than atoms
-    if (ifrom < nall) {
-      for (int m = ifrom; m < ito; ++m) {
-        for (int n = 1; n < nthreads; ++n) {
-          dall[m] += dall[n*nvals + m];
-          dall[n*nvals + m] = 0.0;
-        }
-      }
-    }
-  }
-#else
-  // NOOP in non-threaded execution.
-  return;
-#endif
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    thr->init_pppm(order,memory);
+  }
 }

-/* ---------------------------------------------------------------------- */
+/* ----------------------------------------------------------------------
+   free memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */

-void PPPMTIP4POMP::init()
+void PPPMTIP4POMP::deallocate()
 {
-  // TIP4P PPPM requires newton on, b/c it computes forces on ghost atoms
+  PPPMTIP4P::deallocate();

-  if (force->newton == 0)
-    error->all(FLERR,"Kspace style pppm/tip4p/omp requires newton on");
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    thr->init_pppm(-order,memory);
+  }
+}

-  PPPMOMP::init();
+/* ----------------------------------------------------------------------
+   pre-compute modified (Hockney-Eastwood) Coulomb Green's function
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::compute_gf_ik()
+{
+  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);
+
+  const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+  const int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+  const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
+                                    pow(-log(EPS_HOC),0.25));
+  const int numk = nxhi_fft - nxlo_fft + 1;
+  const int numl = nyhi_fft - nylo_fft + 1;
+
+  const int twoorder = 2*order;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    double snx,sny,snz;
+    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+    double sum1,dot1,dot2;
+    double numerator,denominator;
+    double sqk;
+
+    int k,l,m,nx,ny,nz,kper,lper,mper,n,nfrom,nto,tid;
+
+    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
+
+    for (n = nfrom; n < nto; ++n) {
+      m = n / (numl*numk);
+      l = (n - m*numl*numk) / numk;
+      k = n - m*numl*numk - l*numk;
+      m += nzlo_fft;
+      l += nylo_fft;
+      k += nxlo_fft;
+
+      mper = m - nz_pppm*(2*m/nz_pppm);
+      snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
+
+      lper = l - ny_pppm*(2*l/ny_pppm);
+      sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
+
+      kper = k - nx_pppm*(2*k/nx_pppm);
+      snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
+
+      sqk = square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
+
+      if (sqk != 0.0) {
+        numerator = 12.5663706/sqk;
+        denominator = gf_denom(snx,sny,snz);
+        sum1 = 0.0;
+
+        for (nx = -nbx; nx <= nbx; nx++) {
+          qx = unitkx*(kper+nx_pppm*nx);
+          sx = exp(-0.25*square(qx/g_ewald));
+          argx = 0.5*qx*xprd/nx_pppm;
+          wx = powsinxx(argx,twoorder);
+
+          for (ny = -nby; ny <= nby; ny++) {
+            qy = unitky*(lper+ny_pppm*ny);
+            sy = exp(-0.25*square(qy/g_ewald));
+            argy = 0.5*qy*yprd/ny_pppm;
+            wy = powsinxx(argy,twoorder);
+
+            for (nz = -nbz; nz <= nbz; nz++) {
+              qz = unitkz*(mper+nz_pppm*nz);
+              sz = exp(-0.25*square(qz/g_ewald));
+              argz = 0.5*qz*zprd_slab/nz_pppm;
+              wz = powsinxx(argz,twoorder);
+
+              dot1 = unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
+              dot2 = qx*qx+qy*qy+qz*qz;
+              sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
+            }
+          }
+        }
+        greensfn[n] = numerator*sum1/denominator;
+      } else greensfn[n] = 0.0;
+    }
+  } // end of parallel region
+}
+
+/* ----------------------------------------------------------------------
+   compute optimized Green's function for energy calculation
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::compute_gf_ad()
+{
+
+  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);
+
+  const int numk = nxhi_fft - nxlo_fft + 1;
+  const int numl = nyhi_fft - nylo_fft + 1;
+
+  const int twoorder = 2*order;
+  double sf0=0.0,sf1=0.0,sf2=0.0,sf3=0.0,sf4=0.0,sf5=0.0;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) reduction(+:sf0,sf1,sf2,sf3,sf4,sf5)
+#endif
+  {
+    double snx,sny,snz,sqk;
+    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+    double numerator,denominator;
+    int k,l,m,kper,lper,mper,n,nfrom,nto,tid;
+
+    loop_setup_thr(nfrom, nto, tid, nfft, comm->nthreads);
+
+    for (n = nfrom; n < nto; ++n) {
+
+      m = n / (numl*numk);
+      l = (n - m*numl*numk) / numk;
+      k = n - m*numl*numk - l*numk;
+      m += nzlo_fft;
+      l += nylo_fft;
+      k += nxlo_fft;
+
+      mper = m - nz_pppm*(2*m/nz_pppm);
+      qz = unitkz*mper;
+      snz = square(sin(0.5*qz*zprd_slab/nz_pppm));
+      sz = exp(-0.25*square(qz/g_ewald));
+      argz = 0.5*qz*zprd_slab/nz_pppm;
+      wz = powsinxx(argz,twoorder);
+
+      lper = l - ny_pppm*(2*l/ny_pppm);
+      qy = unitky*lper;
+      sny = square(sin(0.5*qy*yprd/ny_pppm));
+      sy = exp(-0.25*square(qy/g_ewald));
+      argy = 0.5*qy*yprd/ny_pppm;
+      wy = powsinxx(argy,twoorder);
+
+      kper = k - nx_pppm*(2*k/nx_pppm);
+      qx = unitkx*kper;
+      snx = square(sin(0.5*qx*xprd/nx_pppm));
+      sx = exp(-0.25*square(qx/g_ewald));
+      argx = 0.5*qx*xprd/nx_pppm;
+      wx = powsinxx(argx,twoorder);
+
+      sqk = qx*qx + qy*qy + qz*qz;
+
+      if (sqk != 0.0) {
+        numerator = MY_4PI/sqk;
+        denominator = gf_denom(snx,sny,snz);
+        greensfn[n] = numerator*sx*sy*sz*wx*wy*wz/denominator;
+        sf0 += sf_precoeff1[n]*greensfn[n];
+        sf1 += sf_precoeff2[n]*greensfn[n];
+        sf2 += sf_precoeff3[n]*greensfn[n];
+        sf3 += sf_precoeff4[n]*greensfn[n];
+        sf4 += sf_precoeff5[n]*greensfn[n];
+        sf5 += sf_precoeff6[n]*greensfn[n];
+      } else {
+        greensfn[n] = 0.0;
+        sf0 += sf_precoeff1[n]*greensfn[n];
+        sf1 += sf_precoeff2[n]*greensfn[n];
+        sf2 += sf_precoeff3[n]*greensfn[n];
+        sf3 += sf_precoeff4[n]*greensfn[n];
+        sf4 += sf_precoeff5[n]*greensfn[n];
+        sf5 += sf_precoeff6[n]*greensfn[n];
+      }
+    }
+  } // end of parallel region
+
+  // compute the coefficients for the self-force correction
+
+  double prex, prey, prez, tmp[6];
+  prex = prey = prez = MY_PI/volume;
+  prex *= nx_pppm/xprd;
+  prey *= ny_pppm/yprd;
+  prez *= nz_pppm/zprd_slab;
+  tmp[0] = sf0 * prex;
+  tmp[1] = sf1 * prex*2;
+  tmp[2] = sf2 * prey;
+  tmp[3] = sf3 * prey*2;
+  tmp[4] = sf4 * prez;
+  tmp[5] = sf5 * prez*2;
+
+  // communicate values with other procs
+
+  MPI_Allreduce(tmp,sf_coeff,6,MPI_DOUBLE,MPI_SUM,world);
+}
+
+/* ----------------------------------------------------------------------
+   run the regular toplevel compute method from plain PPPM
+   which will have individual methods replaced by our threaded
+   versions and then call the obligatory force reduction.
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::compute(int eflag, int vflag)
+{
+
+  PPPMTIP4P::compute(eflag,vflag);
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+#if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+#else
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
 }

 /* ----------------------------------------------------------------------
@@ -107,8 +330,12 @@ void PPPMTIP4POMP::init()

 void PPPMTIP4POMP::particle_map()
 {
-  const int * const type = atom->type;
-  const double * const * const x = atom->x;
+  const int * _noalias const type = atom->type;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];

   const int nlocal = atom->nlocal;
   int i, flag = 0;
@@ -116,34 +343,33 @@
 #pragma omp parallel for private(i) default(none) reduction(+:flag) schedule(static)
 #endif
   for (i = 0; i < nlocal; i++) {
-    int nx,ny,nz,iH1,iH2;
-    double xM[3];
+    dbl3_t xM;
+    int iH1,iH2;

     if (type[i] == typeO) {
-      find_M(i,iH1,iH2,xM);
+      find_M_thr(i,iH1,iH2,xM);
     } else {
-      xM[0] = x[i][0];
-      xM[1] = x[i][1];
-      xM[2] = x[i][2];
+      xM = x[i];
     }

     // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
     // current particle coord can be outside global and local box
     // add/subtract OFFSET to avoid int(-0.75) = 0 when want it to be -1

-    nx = static_cast<int> ((xM[0]-boxlo[0])*delxinv+shift) - OFFSET;
-    ny = static_cast<int> ((xM[1]-boxlo[1])*delyinv+shift) - OFFSET;
-    nz = static_cast<int> ((xM[2]-boxlo[2])*delzinv+shift) - OFFSET;
+    const int nx = static_cast<int> ((xM.x-boxlox)*delxinv+shift) - OFFSET;
+    const int ny = static_cast<int> ((xM.y-boxloy)*delyinv+shift) - OFFSET;
+    const int nz = static_cast<int> ((xM.z-boxloz)*delzinv+shift) - OFFSET;

-    part2grid[i][0] = nx;
-    part2grid[i][1] = ny;
-    part2grid[i][2] = nz;
+    p2g[i].a = nx;
+    p2g[i].b = ny;
+    p2g[i].t = nz;

     // check that entire stencil around nx,ny,nz will fit in my 3d brick

     if (nx+nlower < nxlo_out || nx+nupper > nxhi_out ||
         ny+nlower < nylo_out || ny+nupper > nyhi_out ||
-        nz+nlower < nzlo_out || nz+nupper > nzhi_out) flag++;
+        nz+nlower < nzlo_out || nz+nupper > nzhi_out)
+      flag++;
   }

   int flag_all;
@@ -160,96 +386,68 @@

 void PPPMTIP4POMP::make_rho()
 {
-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
-  const int * const type = atom->type;
-  const int nthreads = comm->nthreads;
+  const double * _noalias const q = atom->q;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+  const int * _noalias const type = atom->type;
+
+  dbl3_t xM;
+  FFT_SCALAR x0,y0,z0;
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];
+
+  int l,m,n,mx,my,mz,iH1,iH2;
+  const int nlocal = atom->nlocal;

-#if defined(_OPENMP)
-#pragma omp parallel default(none)
-#endif
-  {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = nlocal;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int tid = 0;
-    const int ifrom = 0;
-    const int ito = nlocal;
-#endif
+  // clear 3d density array

-    int l,m,n,nx,ny,nz,mx,my,mz,iH1,iH2;
-    FFT_SCALAR dx,dy,dz,x0,y0,z0;
-    double xM[3];
+  FFT_SCALAR *vec = &density_brick[nzlo_out][nylo_out][nxlo_out];
+  memset(vec,0,ngrid*sizeof(FFT_SCALAR));

-    // set up clear 3d density array
-    const int nzoffs = (nzhi_out-nzlo_out+1)*tid;
-    FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]);
-    memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR));
+  // loop over my charges, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt

-    ThrData *thr = fix->get_thr(tid);
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+  for (int i = 0; i < nlocal; i++) {
+    if (type[i] == typeO) {
+      find_M_thr(i,iH1,iH2,xM);
+    } else {
+      xM = x[i];
+    }

-    // loop over my charges, add their contribution to nearby grid points
-    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
-    // (dx,dy,dz) = distance to "lower left" grid pt
-    // (mx,my,mz) = global coords of moving stencil pt
+    const int nx = p2g[i].a;
+    const int ny = p2g[i].b;
+    const int nz = p2g[i].t;
+    const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
+    const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
+    const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;

-    // this if protects against having more threads than local atoms
-    if (ifrom < nlocal) {
-      for (int i = ifrom; i < ito; i++) {
+    compute_rho1d(dx,dy,dz);

-        if (type[i] == typeO) {
-          find_M(i,iH1,iH2,xM);
-        } else {
-          xM[0] = x[i][0];
-          xM[1] = x[i][1];
-          xM[2] = x[i][2];
-        }
-
-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (xM[0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (xM[1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (xM[2]-boxlo[2])*delzinv;
-
-        compute_rho1d_thr(r1d,dx,dy,dz);
-
-        z0 = delvolinv * q[i];
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          y0 = z0*r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            x0 = y0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              db[mz][my][mx] += x0*r1d[0][l];
-            }
-          }
+    z0 = delvolinv * q[i];
+    for (n = nlower; n <= nupper; n++) {
+      mz = n+nz;
+      y0 = z0*rho1d[2][n];
+      for (m = nlower; m <= nupper; m++) {
+        my = m+ny;
+        x0 = y0*rho1d[1][m];
+        for (l = nlower; l <= nupper; l++) {
+          mx = l+nx;
+          density_brick[mz][my][mx] += x0*rho1d[0][l];
        }
      }
    }
-#if defined(_OPENMP)
-    // reduce 3d density array
-    if (nthreads > 1) {
-      data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
-    }
-#endif
-  }
 }

 /* ----------------------------------------------------------------------
-   interpolate from grid to get electric field & force on my particles
+   interpolate from grid to get electric field & force on my particles for ik
 ------------------------------------------------------------------------- */

-void PPPMTIP4POMP::fieldforce()
+void PPPMTIP4POMP::fieldforce_ik()
 {
   // loop over my charges, interpolate electric field from nearby grid points
@@ -257,106 +455,216 @@ void PPPMTIP4POMP::fieldforce()
   // (mx,my,mz) = global coords of moving stencil pt
   // ek = 3 components of E-field on particle

-  const double * const q = atom->q;
-  const double * const * const x = atom->x;
-  const int * const type = atom->type;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;
+  const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+  const int * _noalias const type = atom->type;
+
+  const double qqrd2e = force->qqrd2e;
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];
+
+  const int nthreads = comm->nthreads;
   const int nlocal = atom->nlocal;
-  const double qqrd2e = force->qqrd2e;

 #if defined(_OPENMP)
 #pragma omp parallel default(none)
 #endif
   {
-#if defined(_OPENMP)
-    // each thread works on a fixed chunk of atoms.
-    const int tid = omp_get_thread_num();
-    const int inum = nlocal;
-    const int idelta = 1 + inum/nthreads;
-    const int ifrom = tid*idelta;
-    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
-#else
-    const int ifrom = 0;
-    const int ito = nlocal;
-    const int tid = 0;
-#endif
+    dbl3_t xM;
+    FFT_SCALAR x0,y0,z0,ekx,eky,ekz;
+    int i,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz;
+
+    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
+
+    // get per thread data
     ThrData *thr = fix->get_thr(tid);
-    double * const * const f = thr->get_f();
-    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

-    int l,m,n,nx,ny,nz,mx,my,mz;
-    FFT_SCALAR dx,dy,dz,x0,y0,z0;
-    FFT_SCALAR ekx,eky,ekz;
-    int iH1,iH2;
-    double xM[3], fx,fy,fz;
+    for (i = ifrom; i < ito; ++i) {
+      if (type[i] == typeO) {
+        find_M_thr(i,iH1,iH2,xM);
+      } else xM = x[i];

-    // this if protects against having more threads than local atoms
-    if (ifrom < nlocal) {
-      for (int i = ifrom; i < ito; i++) {
+      const int nx = p2g[i].a;
+      const int ny = p2g[i].b;
+      const int nz = p2g[i].t;
+      const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
+      const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
+      const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;

-        if (type[i] == typeO) {
-          find_M(i,iH1,iH2,xM);
-        } else {
-          xM[0] = x[i][0];
-          xM[1] = x[i][1];
-          xM[2] = x[i][2];
-        }
+      compute_rho1d_thr(r1d,dx,dy,dz);

-        nx = part2grid[i][0];
-        ny = part2grid[i][1];
-        nz = part2grid[i][2];
-        dx = nx+shiftone - (xM[0]-boxlo[0])*delxinv;
-        dy = ny+shiftone - (xM[1]-boxlo[1])*delyinv;
-        dz = nz+shiftone - (xM[2]-boxlo[2])*delzinv;
-
-        compute_rho1d_thr(r1d,dx,dy,dz);
-
-        ekx = eky = ekz = ZEROF;
-        for (n = nlower; n <= nupper; n++) {
-          mz = n+nz;
-          z0 = r1d[2][n];
-          for (m = nlower; m <= nupper; m++) {
-            my = m+ny;
-            y0 = z0*r1d[1][m];
-            for (l = nlower; l <= nupper; l++) {
-              mx = l+nx;
-              x0 = y0*r1d[0][l];
-              ekx -= x0*vdx_brick[mz][my][mx];
-              eky -= x0*vdy_brick[mz][my][mx];
-              ekz -= x0*vdz_brick[mz][my][mx];
-            }
+      ekx = eky = ekz = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        z0 = r1d[2][n];
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          y0 = z0*r1d[1][m];
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            x0 = y0*r1d[0][l];
+            ekx -= x0*vdx_brick[mz][my][mx];
+            eky -= x0*vdy_brick[mz][my][mx];
+            ekz -= x0*vdz_brick[mz][my][mx];
          }
        }
+      }

-        // convert E-field to force
-        const double qfactor = qqrd2e*scale*q[i];
+      // convert E-field to force
+
+      const double qfactor = qqrd2e * scale * q[i];
+      if (type[i] != typeO) {
+        f[i].x += qfactor*ekx;
+        f[i].y += qfactor*eky;
+        if (slabflag != 2) f[i].z += qfactor*ekz;

-        if (type[i] != typeO) {
-          f[i][0] += qfactor*ekx;
-          f[i][1] += qfactor*eky;
-          f[i][2] += qfactor*ekz;
-
-        } else {
-          fx = qfactor * ekx;
-          fy = qfactor * eky;
-          fz = qfactor * ekz;
-          find_M(i,iH1,iH2,xM);
+      } else {
+        const double fx = qfactor * ekx;
+        const double fy = qfactor * eky;
+        const double fz = qfactor * ekz;

-          f[i][0] += fx*(1 - alpha);
-          f[i][1] += fy*(1 - alpha);
-          f[i][2] += fz*(1 - alpha);
+        f[i].x += fx*(1 - alpha);
+        f[i].y += fy*(1 - alpha);
+        if (slabflag != 2) f[i].z += fz*(1 - alpha);

-          f[iH1][0] += 0.5*alpha*fx;
-          f[iH1][1] += 0.5*alpha*fy;
-          f[iH1][2] += 0.5*alpha*fz;
+        f[iH1].x += 0.5*alpha*fx;
+        f[iH1].y += 0.5*alpha*fy;
+        if (slabflag != 2) f[iH1].z += 0.5*alpha*fz;

-          f[iH2][0] += 0.5*alpha*fx;
-          f[iH2][1] += 0.5*alpha*fy;
-          f[iH2][2] += 0.5*alpha*fz;
-        }
+        f[iH2].x += 0.5*alpha*fx;
+        f[iH2].y += 0.5*alpha*fy;
+        if (slabflag != 2) f[iH2].z += 0.5*alpha*fz;
      }
    }
-  }
+  } // end of parallel region
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles for ad
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::fieldforce_ad()
+{
+  const double *prd = (triclinic == 0) ? domain->prd : domain->prd_lamda;
+  const double hx_inv = nx_pppm/prd[0];
+  const double hy_inv = ny_pppm/prd[1];
+  const double hz_inv = nz_pppm/prd[2];
+
+  // loop over my charges, interpolate electric field from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+  // ek = 3 components of E-field on particle
+
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  const double * _noalias const q = atom->q;
+  const int3_t * _noalias const p2g = (int3_t *) part2grid[0];
+  const int * _noalias const type = atom->type;
+
+  const double qqrd2e = force->qqrd2e;
+  const double boxlox = boxlo[0];
+  const double boxloy = boxlo[1];
+  const double boxloz = boxlo[2];
+
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
+#endif
+  {
+    double s1,s2,s3,sf;
+    dbl3_t xM;
+    FFT_SCALAR ekx,eky,ekz;
+    int i,ifrom,ito,tid,iH1,iH2,l,m,n,mx,my,mz;
+
+    loop_setup_thr(ifrom,ito,tid,nlocal,nthreads);
+
+    // get per thread data
+    ThrData *thr = fix->get_thr(tid);
+    dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());
+    FFT_SCALAR * const * const d1d = static_cast<FFT_SCALAR **>(thr->get_drho1d());
+
+    for (i = ifrom; i < ito; ++i) {
+      if (type[i] == typeO) {
+        find_M_thr(i,iH1,iH2,xM);
+      } else xM = x[i];
+
+      const int nx = p2g[i].a;
+      const int ny = p2g[i].b;
+      const int nz = p2g[i].t;
+      const FFT_SCALAR dx = nx+shiftone - (xM.x-boxlox)*delxinv;
+      const FFT_SCALAR dy = ny+shiftone - (xM.y-boxloy)*delyinv;
+      const FFT_SCALAR dz = nz+shiftone - (xM.z-boxloz)*delzinv;
+
+      compute_rho1d_thr(r1d,dx,dy,dz);
+      compute_drho1d_thr(d1d,dx,dy,dz);
+
+      ekx = eky = ekz = ZEROF;
+      for (n = nlower; n <= nupper; n++) {
+        mz = n+nz;
+        for (m = nlower; m <= nupper; m++) {
+          my = m+ny;
+          for (l = nlower; l <= nupper; l++) {
+            mx = l+nx;
+            ekx += d1d[0][l]*r1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+            eky += r1d[0][l]*d1d[1][m]*r1d[2][n]*u_brick[mz][my][mx];
+            ekz += r1d[0][l]*r1d[1][m]*d1d[2][n]*u_brick[mz][my][mx];
+          }
+        }
+      }
+      ekx *= hx_inv;
+      eky *= hy_inv;
+      ekz *= hz_inv;
+
+      // convert E-field to force and subtract self forces
+
+      const double qi = q[i];
+      const double qfactor = qqrd2e * scale * qi;
+
+      s1 = x[i].x*hx_inv;
+      sf = sf_coeff[0]*sin(MY_2PI*s1);
+      sf += sf_coeff[1]*sin(MY_4PI*s1);
+      sf *= 2.0*qi;
+      const double fx = qfactor*(ekx - sf);
+
+      s2 = x[i].y*hy_inv;
+      sf = sf_coeff[2]*sin(MY_2PI*s2);
+      sf += sf_coeff[3]*sin(MY_4PI*s2);
+      sf *= 2.0*qi;
+      const double fy = qfactor*(eky - sf);
+
+      s3 = x[i].z*hz_inv;
+      sf = sf_coeff[4]*sin(MY_2PI*s3);
+      sf += sf_coeff[5]*sin(MY_4PI*s3);
+      sf *= 2.0*qi;
+      const double fz = qfactor*(ekz - sf);
+
+      if (type[i] != typeO) {
+        f[i].x += fx;
+        f[i].y += fy;
+        if (slabflag != 2) f[i].z += fz;
+
+      } else {
+        f[i].x += fx*(1 - alpha);
+        f[i].y += fy*(1 - alpha);
+        if (slabflag != 2) f[i].z += fz*(1 - alpha);
+
+        f[iH1].x += 0.5*alpha*fx;
+        f[iH1].y += 0.5*alpha*fy;
+        if (slabflag != 2) f[iH1].z += 0.5*alpha*fz;
+
+        f[iH2].x += 0.5*alpha*fx;
+        f[iH2].y += 0.5*alpha*fy;
+        if (slabflag != 2) f[iH2].z += 0.5*alpha*fz;
+      }
+    }
+  } // end of parallel region
 }

 /* ----------------------------------------------------------------------
@@ -365,7 +673,7 @@ void PPPMTIP4POMP::fieldforce()
    also return local indices iH1,iH2 of H atoms
 ------------------------------------------------------------------------- */

-void PPPMTIP4POMP::find_M(int i, int &iH1, int &iH2, double *xM)
+void PPPMTIP4POMP::find_M_thr(int i, int &iH1, int &iH2, dbl3_t &xM)
 {
   iH1 = atom->map(atom->tag[i] + 1);
   iH2 = atom->map(atom->tag[i] + 2);
@@ -374,19 +682,69 @@ void PPPMTIP4POMP::find_M(int i, int &iH1, int &iH2, double *xM)

   if (atom->type[iH1] != typeH || atom->type[iH2] != typeH)
     error->one(FLERR,"TIP4P hydrogen has incorrect atom type");

-  const double * const * const x = atom->x;
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];

-  double delx1 = x[iH1][0] - x[i][0];
-  double dely1 = x[iH1][1] - x[i][1];
-  double delz1 = x[iH1][2] - x[i][2];
+  double delx1 = x[iH1].x - x[i].x;
+  double dely1 = x[iH1].y - x[i].y;
+  double delz1 = x[iH1].z - x[i].z;
   domain->minimum_image(delx1,dely1,delz1);

-  double delx2 = x[iH2][0] - x[i][0];
-  double dely2 = x[iH2][1] - x[i][1];
-  double delz2 = x[iH2][2] - x[i][2];
+  double delx2 = x[iH2].x - x[i].x;
+  double dely2 = x[iH2].y - x[i].y;
+  double delz2 = x[iH2].z - x[i].z;
   domain->minimum_image(delx2,dely2,delz2);

-  xM[0] = x[i][0] + alpha * 0.5 * (delx1 + delx2);
-  xM[1] = x[i][1] + alpha * 0.5 * (dely1 + dely2);
-  xM[2] = x[i][2] + alpha * 0.5 * (delz1 + delz2);
+  xM.x = x[i].x + alpha * 0.5 * (delx1 + delx2);
+  xM.y = x[i].y + alpha * 0.5 * (dely1 + dely2);
+  xM.z = x[i].z + alpha * 0.5 * (delz1 + delz2);
+}
+
+
+/* ----------------------------------------------------------------------
+   charge assignment into rho1d
+   dx,dy,dz = distance of particle from "lower left" grid point
+------------------------------------------------------------------------- */
+void PPPMTIP4POMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d, const FFT_SCALAR &dx,
+                                     const FFT_SCALAR &dy, const FFT_SCALAR &dz)
+{
+  int k,l;
+  FFT_SCALAR r1,r2,r3;
+
+  for (k = (1-order)/2; k <= order/2; k++) {
+    r1 = r2 = r3 = ZEROF;
+
+    for (l = order-1; l >= 0; l--) {
+      r1 = rho_coeff[l][k] + r1*dx;
+      r2 = rho_coeff[l][k] + r2*dy;
+      r3 = rho_coeff[l][k] + r3*dz;
+    }
+    r1d[0][k] = r1;
+    r1d[1][k] = r2;
+    r1d[2][k] = r3;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   charge assignment into drho1d
+   dx,dy,dz = distance of particle from "lower left" grid point
+------------------------------------------------------------------------- */
+
+void PPPMTIP4POMP::compute_drho1d_thr(FFT_SCALAR * const * const d1d, const FFT_SCALAR &dx,
+                                      const FFT_SCALAR &dy, const FFT_SCALAR &dz)
+{
+  int k,l;
+  FFT_SCALAR r1,r2,r3;
+
+  for (k = (1-order)/2; k <= order/2; k++) {
+    r1 = r2 = r3 = ZEROF;
+
+    for (l = order-2; l >= 0; l--) {
+      r1 = drho_coeff[l][k] + r1*dx;
+      r2 = drho_coeff[l][k] + r2*dy;
+      r3 = drho_coeff[l][k] + r3*dz;
+    }
+    d1d[0][k] = r1;
+    d1d[1][k] = r2;
+    d1d[2][k] = r3;
+  }
+}
diff --git a/src/USER-OMP/pppm_tip4p_omp.h b/src/USER-OMP/pppm_tip4p_omp.h
index 11801ccb35..0ee807651c 100644
--- a/src/USER-OMP/pppm_tip4p_omp.h
+++ b/src/USER-OMP/pppm_tip4p_omp.h
@@ -20,25 +20,40 @@ KSpaceStyle(pppm/tip4p/omp,PPPMTIP4POMP)
 #ifndef LMP_PPPM_TIP4P_OMP_H
 #define LMP_PPPM_TIP4P_OMP_H

-#include "pppm_omp.h"
+#include "pppm_tip4p.h"
+#include "thr_omp.h"

 namespace LAMMPS_NS {

-class PPPMTIP4POMP : public PPPMOMP {
+class PPPMTIP4POMP : public PPPMTIP4P, public ThrOMP {

  public:
   PPPMTIP4POMP(class LAMMPS *, int, char **);
   virtual ~PPPMTIP4POMP () {};
-  virtual void init();
+  virtual void compute(int, int);

  protected:
+  virtual void allocate();
+  virtual void deallocate();
+
+  virtual void compute_gf_ik();
+  virtual void compute_gf_ad();
+
   virtual void particle_map();
-  virtual void fieldforce();
-  virtual void make_rho();
+  virtual void make_rho(); // XXX: not (yet) multi-threaded
+
+  virtual void fieldforce_ik();
+  virtual void fieldforce_ad();
+  // virtual void fieldforce_peratom(); XXX: need to benchmark first.

  private:
-  void find_M(int, int &, int &, double *);
+  void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
+                         const FFT_SCALAR &, const FFT_SCALAR &);
+  void compute_drho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
+                          const FFT_SCALAR &, const FFT_SCALAR &);

-// void slabcorr(int);
+  void find_M_thr(const int, int &, int &, dbl3_t &);
+
+// void slabcorr(int); // XXX: not (yet) multi-threaded
 };
diff --git a/src/USER-OMP/thr_data.cpp b/src/USER-OMP/thr_data.cpp
index cebed27721..c0217ed01f 100644
--- a/src/USER-OMP/thr_data.cpp
+++ b/src/USER-OMP/thr_data.cpp
@@ -21,13 +21,14 @@

 #include <string.h>
 #include <stdio.h>

+#include "memory.h"
+
 using namespace LAMMPS_NS;

 ThrData::ThrData(int tid)
-  : _f(NULL), _torque(NULL), _erforce(NULL), _de(NULL), _drho(NULL), _mu(NULL),
-    _lambda(NULL), _rhoB(NULL), _D_values(NULL), _rho(NULL), _fp(NULL),
-    _rho1d(NULL), _tid(tid)
+  : _f(0),_torque(0),_erforce(0),_de(0),_drho(0),_mu(0),_lambda(0),_rhoB(0),
+    _D_values(0),_rho(0),_fp(0),_rho1d(0),_drho1d(0),_tid(tid)
 {
   // nothing else to do here.
 }
@@ -136,6 +137,33 @@ void ThrData::init_eim(int nall, double *rho, double *fp)
   memset(_fp,0,nall*sizeof(double));
 }

+/* ----------------------------------------------------------------------
+   if order > 0 : set up per thread storage for PPPM
+   if order < 0 : free per thread storage for PPPM
+------------------------------------------------------------------------- */
+#if defined(FFT_SINGLE)
+typedef float FFT_SCALAR;
+#else
+typedef double FFT_SCALAR;
+#endif
+
+void ThrData::init_pppm(int order, Memory *memory)
+{
+  FFT_SCALAR **rho1d, **drho1d;
+  if (order > 0) {
+    memory->create2d_offset(rho1d,3,-order/2,order/2,"thr_data:rho1d");
+    memory->create2d_offset(drho1d,3,-order/2,order/2,"thr_data:drho1d");
+    _rho1d = static_cast<void *>(rho1d);
+    _drho1d = static_cast<void *>(drho1d);
+  } else {
+    order = -order;
+    rho1d = static_cast<FFT_SCALAR **>(_rho1d);
+    drho1d = static_cast<FFT_SCALAR **>(_drho1d);
+    memory->destroy2d_offset(rho1d,-order/2);
+    memory->destroy2d_offset(drho1d,-order/2);
+  }
+}
+
 /* ----------------------------------------------------------------------
    compute global pair virial via summing F dot r over own & ghost atoms
    at this point, only pairwise forces have been accumulated in atom->f
diff --git a/src/USER-OMP/thr_data.h b/src/USER-OMP/thr_data.h
index 6345a203c7..7fc80a4929 100644
--- a/src/USER-OMP/thr_data.h
+++ b/src/USER-OMP/thr_data.h
@@ -53,7 +53,7 @@ class ThrData {
   void init_eam(int, double *);            // EAM
   void init_eim(int, double *, double *);  // EIM (+ EAM)

-  void init_pppm(void *r1d) { _rho1d = r1d; };
+  void init_pppm(int, class Memory *);

   // access methods for arrays that we handle in this class
   double **get_lambda() const { return _lambda; };
@@ -63,6 +63,7 @@ class ThrData {
   double *get_rho() const { return _rho; };
   double *get_rhoB() const { return _rhoB; };
   void *get_rho1d() const { return _rho1d; };
+  void *get_drho1d() const { return _drho1d; };

  private:
   double eng_vdwl; // non-bonded non-coulomb energy
@@ -108,6 +109,7 @@ class ThrData {

   // this is for pppm/omp
   void *_rho1d;
+  void *_drho1d;

   // my thread id
   const int _tid;
diff --git a/src/USER-OMP/thr_omp.h b/src/USER-OMP/thr_omp.h
index 3c5bbdec3e..79d5c4548c 100644
--- a/src/USER-OMP/thr_omp.h
+++ b/src/USER-OMP/thr_omp.h
@@ -186,4 +186,19 @@ static inline void loop_setup_thr(int &ifrom, int &ito, int &tid,
   }
 }

+
+// helpful definitions to help compilers optimize code better
+
+typedef struct { double x,y,z; } dbl3_t;
+typedef struct { double x,y,z,w; } dbl4_t;
+typedef struct { int a,b,t; } int3_t;
+typedef struct { int a,b,c,t; } int4_t;
+typedef struct { int a,b,c,d,t; } int5_t;
+
+#if defined(__GNUC__)
+#define _noalias __restrict
+#else
+#define _noalias
+#endif
+
 #endif
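
A note on the flat-index decomposition used in the new compute_gf_ik()/compute_gf_ad() loops: instead of three nested loops over the FFT sub-grid, the patch runs one loop over n in [nfrom,nto) and recovers (k,l,m) arithmetically, so loop_setup_thr() can hand each thread a contiguous chunk. The following standalone sketch (bounds and variable names are made up for illustration, not taken from LAMMPS) verifies that the decomposition is exactly invertible:

#include <cassert>

int main()
{
  // hypothetical example bounds of one processor's FFT-space sub-grid
  const int nxlo = 0, nxhi = 4, nylo = 0, nyhi = 3, nzlo = 0, nzhi = 2;
  const int numk = nxhi - nxlo + 1;    // x is the fastest-varying dimension
  const int numl = nyhi - nylo + 1;
  const int nfft = numk*numl*(nzhi - nzlo + 1);

  for (int n = 0; n < nfft; ++n) {
    // same arithmetic as in the patch above
    int m = n / (numl*numk);
    int l = (n - m*numl*numk) / numk;
    int k = n - m*numl*numk - l*numk;
    m += nzlo; l += nylo; k += nxlo;
    // invert the mapping to confirm nothing is lost or duplicated
    assert(n == (k-nxlo) + numk*((l-nylo) + numl*(m-nzlo)));
  }
  return 0;
}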
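The patch also replaces the removed idiom "wx = 1.0; if (argx != 0.0) wx = pow(sin(argx)/argx,dorder); wx *= wx;" with a single call powsinxx(argx,twoorder) from math_special.h. Judging purely from the code it replaces, the helper evaluates (sin(x)/x)^n with the x == 0 limit handled explicitly; a minimal stand-in with that assumed behavior (the real powsinxx() may be implemented differently, e.g. without calling pow()) looks like:

#include <math.h>

// sketch only: assumed semantics of MathSpecial::powsinxx(x,n)
static double powsinxx_sketch(const double x, int n)
{
  if (x == 0.0) return 1.0;          // lim x->0 of sin(x)/x is 1
  const double f = sin(x)/x;
  return pow(f,(double) n);          // n = 2*order in the loops above
}

Folding the squaring into the exponent (twoorder = 2*order) is what lets the two-step "pow then square" of the old code collapse into one call.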
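loop_setup_thr() (declared in thr_omp.h above) replaces the per-function chunking boilerplate that this patch deletes everywhere. Reconstructed from that removed code, the partitioning is a simple static split into equal chunks; a sketch under that assumption, with the thread id passed in explicitly so it runs without OpenMP:

// sketch of the static work partitioning, not the original implementation
static void chunk_bounds(int &ifrom, int &ito, int tid,
                         int inum, int nthreads)
{
  const int idelta = 1 + inum/nthreads;           // chunk size per thread
  ifrom = tid*idelta;
  ito = (ifrom + idelta > inum) ? inum : ifrom + idelta;
  // if there are more threads than work items, ifrom >= ito and the
  // caller's for (i = ifrom; i < ito; ++i) loop simply does nothing
}

Centralizing this removes the repeated "this if protects against having more threads than local atoms" guard, since an empty [ifrom,ito) range is harmless.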
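Why the dbl3_t/int3_t typedefs and the _noalias macro at the end of thr_omp.h matter: atom->x is a double** whose rows live in one contiguous 3N block, so casting x[0] to a struct-of-three pointer with __restrict tells the compiler that distinct array elements do not alias, which helps vectorization of the interpolation loops. A minimal usage sketch (function name is hypothetical; the contiguous-storage assumption mirrors what the patch relies on):

typedef struct { double x,y,z; } dbl3_t;
#if defined(__GNUC__)
#define _noalias __restrict
#else
#define _noalias
#endif

// scale all positions; xraw[0] must point at contiguous x,y,z triples
void scale_positions(double **xraw, int n, double s)
{
  dbl3_t * _noalias const x = (dbl3_t *) xraw[0];  // same cast as the patch
  for (int i = 0; i < n; ++i) {
    x[i].x *= s;
    x[i].y *= s;
    x[i].z *= s;
  }
}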
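Finally, on compute_rho1d_thr() versus the new compute_drho1d_thr(): both evaluate the per-dimension stencil weights by Horner's rule, and the derivative loop starts at l = order-2 because differentiating the degree order-1 weight polynomial drops the degree by one. A compact sketch for a single grid offset k (coeff/dcoeff are hypothetical 1-D stand-ins for rho_coeff[l][k] and drho_coeff[l][k]):

// Horner evaluation of a weight polynomial and its derivative polynomial
static void horner_pair(const double *coeff, const double *dcoeff,
                        int order, double dx, double *w, double *dw)
{
  double r = 0.0, d = 0.0;
  for (int l = order-1; l >= 0; --l) r = coeff[l] + r*dx;   // degree order-1
  for (int l = order-2; l >= 0; --l) d = dcoeff[l] + d*dx;  // one degree less
  *w = r;
  *dw = d;
}

This is why the ad ("analytic differentiation") force path needs only u_brick plus drho1d, instead of the three vdx/vdy/vdz bricks used by the ik path.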