/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: Axel Kohlmeyer (Temple U)
------------------------------------------------------------------------- */

#include "pppm_omp.h"
#include "atom.h"
#include "comm.h"
#include "domain.h"
#include "force.h"
#include "memory.h"
#include "math_const.h"

#include <string.h>
#include <math.h>

#include "suffix.h"

using namespace LAMMPS_NS;
using namespace MathConst;

#ifdef FFT_SINGLE
#define ZEROF 0.0f
#define ONEF  1.0f
#else
#define ZEROF 0.0
#define ONEF  1.0
#endif

#define EPS_HOC 1.0e-7

/* ---------------------------------------------------------------------- */

PPPMOMP::PPPMOMP(LAMMPS *lmp, int narg, char **arg) :
  PPPM(lmp, narg, arg), ThrOMP(lmp, THR_KSPACE)
{
  suffix_flag |= Suffix::OMP;
}

/* ----------------------------------------------------------------------
   allocate memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */

void PPPMOMP::allocate()
{
  PPPM::allocate();

  const int nthreads = comm->nthreads;

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
#else
    const int tid = 0;
#endif

    FFT_SCALAR **rho1d_thr;
    memory->create2d_offset(rho1d_thr,3,-order/2,order/2,"pppm:rho1d_thr");
    ThrData *thr = fix->get_thr(tid);
    thr->init_pppm(static_cast<void *>(rho1d_thr));
  }

  const int nzend = (nzhi_out-nzlo_out+1)*nthreads + nzlo_out - 1;

  // reallocate density brick, so it fits our needs
  memory->destroy3d_offset(density_brick,nzlo_out,nylo_out,nxlo_out);
  memory->create3d_offset(density_brick,nzlo_out,nzend,nylo_out,nyhi_out,
                          nxlo_out,nxhi_out,"pppm:density_brick");
}

// NOTE: special version of reduce_data for the FFT_SCALAR data type.
// reduce per-thread data into the first part of the data array,
// which is used for the non-threaded parts, and reset the temporary
// storage to 0.0. this routine depends on multi-dimensional arrays
// like force being stored in this order: x1,y1,z1,x2,y2,z2,...
// we need to post a barrier to wait until all threads are done
// writing to the array.
static void data_reduce_fft(FFT_SCALAR *dall, int nall, int nthreads,
                            int ndim, int tid)
{
#if defined(_OPENMP)
  // NOOP in non-threaded execution.
  if (nthreads == 1) return;
#pragma omp barrier
  {
    const int nvals = ndim*nall;
    const int idelta = nvals/nthreads + 1;
    const int ifrom = tid*idelta;
    const int ito   = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);

    // this if protects against having more threads than data items
    if (ifrom < nvals) {
      for (int m = ifrom; m < ito; ++m) {
        for (int n = 1; n < nthreads; ++n) {
          dall[m] += dall[n*nvals + m];
          dall[n*nvals + m] = 0.0;
        }
      }
    }
  }
#else
  // NOOP in non-threaded execution.
  return;
#endif
}
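/* ----------------------------------------------------------------------
   illustrative sketch (comment only, not part of the class): with
   nthreads = 2, nall = 3, ndim = 1 the combined array holds the
   thread 0 copy followed by the thread 1 copy,

     dall = [ a0 a1 a2 | b0 b1 b2 ]

   and after data_reduce_fft() the first nall entries contain a_i + b_i
   while the thread 1 copy is zeroed, ready for reuse in the next step.
------------------------------------------------------------------------- */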
/* ----------------------------------------------------------------------
   free memory that depends on # of K-vectors and order
------------------------------------------------------------------------- */

void PPPMOMP::deallocate()
{
  PPPM::deallocate();
  for (int i=0; i < comm->nthreads; ++i) {
    ThrData *thr = fix->get_thr(i);
    FFT_SCALAR **rho1d_thr = static_cast<FFT_SCALAR **>(thr->get_rho1d());
    memory->destroy2d_offset(rho1d_thr,-order/2);
  }
}

/* ----------------------------------------------------------------------
   adjust PPPM coeffs, called initially and whenever volume has changed
------------------------------------------------------------------------- */

void PPPMOMP::setup()
{
  int i,j,k,n;
  double *prd;

  // volume-dependent factors
  // adjust z dimension for 2d slab PPPM
  // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0

  if (triclinic == 0) prd = domain->prd;
  else prd = domain->prd_lamda;

  const double xprd = prd[0];
  const double yprd = prd[1];
  const double zprd = prd[2];
  const double zprd_slab = zprd*slab_volfactor;
  volume = xprd * yprd * zprd_slab;

  delxinv = nx_pppm/xprd;
  delyinv = ny_pppm/yprd;
  delzinv = nz_pppm/zprd_slab;
  delvolinv = delxinv*delyinv*delzinv;

  const double unitkx = (2.0*MY_PI/xprd);
  const double unitky = (2.0*MY_PI/yprd);
  const double unitkz = (2.0*MY_PI/zprd_slab);

  // fkx,fky,fkz for my FFT grid pts

  double per;

  for (i = nxlo_fft; i <= nxhi_fft; i++) {
    per = i - nx_pppm*(2*i/nx_pppm);
    fkx[i] = unitkx*per;
  }

  for (i = nylo_fft; i <= nyhi_fft; i++) {
    per = i - ny_pppm*(2*i/ny_pppm);
    fky[i] = unitky*per;
  }

  for (i = nzlo_fft; i <= nzhi_fft; i++) {
    per = i - nz_pppm*(2*i/nz_pppm);
    fkz[i] = unitkz*per;
  }

  // virial coefficients

  double sqk,vterm;

  n = 0;
  for (k = nzlo_fft; k <= nzhi_fft; k++) {
    for (j = nylo_fft; j <= nyhi_fft; j++) {
      for (i = nxlo_fft; i <= nxhi_fft; i++) {
        sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k];
        if (sqk == 0.0) {
          vg[n][0] = 0.0;
          vg[n][1] = 0.0;
          vg[n][2] = 0.0;
          vg[n][3] = 0.0;
          vg[n][4] = 0.0;
          vg[n][5] = 0.0;
        } else {
          vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
          vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i];
          vg[n][1] = 1.0 + vterm*fky[j]*fky[j];
          vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k];
          vg[n][3] = vterm*fkx[i]*fky[j];
          vg[n][4] = vterm*fkx[i]*fkz[k];
          vg[n][5] = vterm*fky[j]*fkz[k];
        }
        n++;
      }
    }
  }

  // modified (Hockney-Eastwood) Coulomb Green's function

  const int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
  const int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
  const int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) *
                                    pow(-log(EPS_HOC),0.25));
  const double form = 1.0;

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
    int tid,nn,nnfrom,nnto,nx,ny,nz,k,l,m;
    double snx,sny,snz,sqk;
    double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
    double sum1,dot1,dot2;
    double numerator,denominator;

    const double gew2 = -4.0*g_ewald*g_ewald;
    const int nnx = nxhi_fft-nxlo_fft+1;
    const int nny = nyhi_fft-nylo_fft+1;

    loop_setup_thr(nnfrom, nnto, tid, nfft, comm->nthreads);

    for (m = nzlo_fft; m <= nzhi_fft; m++) {

      const double fkzm = fkz[m];
      snz = sin(0.5*fkzm*zprd_slab/nz_pppm);
      snz *= snz;

      for (l = nylo_fft; l <= nyhi_fft; l++) {
        const double fkyl = fky[l];
        sny = sin(0.5*fkyl*yprd/ny_pppm);
        sny *= sny;

        for (k = nxlo_fft; k <= nxhi_fft; k++) {

          /* only compute the part designated to this thread */
          nn = k-nxlo_fft + nnx*(l-nylo_fft + nny*(m-nzlo_fft));
          if ((nn < nnfrom) || (nn >= nnto)) continue;

          const double fkxk = fkx[k];
          snx = sin(0.5*fkxk*xprd/nx_pppm);
          snx *= snx;

          sqk = fkxk*fkxk + fkyl*fkyl + fkzm*fkzm;

          if (sqk != 0.0) {
            numerator = form*MY_4PI/sqk;
            denominator = gf_denom(snx,sny,snz);
            sum1 = 0.0;
            for (nx = -nbx; nx <= nbx; nx++) {
              qx = fkxk + unitkx*nx_pppm*nx;
              sx = exp(qx*qx/gew2);
              wx = 1.0;
              argx = 0.5*qx*xprd/nx_pppm;
              if (argx != 0.0) wx = pow(sin(argx)/argx,order);
              wx *= wx;

              for (ny = -nby; ny <= nby; ny++) {
                qy = fkyl + unitky*ny_pppm*ny;
                sy = exp(qy*qy/gew2);
                wy = 1.0;
                argy = 0.5*qy*yprd/ny_pppm;
                if (argy != 0.0) wy = pow(sin(argy)/argy,order);
                wy *= wy;

                for (nz = -nbz; nz <= nbz; nz++) {
                  qz = fkzm + unitkz*nz_pppm*nz;
                  sz = exp(qz*qz/gew2);
                  wz = 1.0;
                  argz = 0.5*qz*zprd_slab/nz_pppm;
                  if (argz != 0.0) wz = pow(sin(argz)/argz,order);
                  wz *= wz;

                  dot1 = fkxk*qx + fkyl*qy + fkzm*qz;
                  dot2 = qx*qx+qy*qy+qz*qz;
                  sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
                }
              }
            }
            greensfn[nn] = numerator*sum1/denominator;
          } else greensfn[nn] = 0.0;
        }
      }
    }
  }
}
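/* ----------------------------------------------------------------------
   for reference, a sketch of the quantity evaluated above in code
   variables (illustration, not authoritative documentation): for each
   k-vector k = (fkx,fky,fkz) the loop computes

     greensfn = (form*4*pi/k^2)
              * sum_b [ (k . k_b / k_b^2) * exp(-k_b^2/(4*g_ewald^2))
                        * W(k_b)^2 ] / gf_denom(snx,sny,snz)

   where k_b runs over the (2*nbx+1) x (2*nby+1) x (2*nbz+1) alias
   images of k, and W(q) = prod_d (sin(arg_d)/arg_d)^order is the
   Fourier transform of the order-"order" charge assignment stencil.
------------------------------------------------------------------------- */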
/* ----------------------------------------------------------------------
   run the regular toplevel compute method from plain PPPM, which will
   have individual methods replaced by our threaded versions, and then
   call the obligatory force reduction
------------------------------------------------------------------------- */

void PPPMOMP::compute(int eflag, int vflag)
{
  PPPM::compute(eflag,vflag);

#if defined(_OPENMP)
#pragma omp parallel default(none) shared(eflag,vflag)
#endif
  {
#if defined(_OPENMP)
    const int tid = omp_get_thread_num();
#else
    const int tid = 0;
#endif
    ThrData *thr = fix->get_thr(tid);
    reduce_thr(this, eflag, vflag, thr);
  } // end of omp parallel region
}

/* ----------------------------------------------------------------------
   create discretized "density" on section of global grid due to my particles

   density(x,y,z) = charge "density" at grid points of my 3d brick
   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
   in global grid
------------------------------------------------------------------------- */

void PPPMOMP::make_rho()
{
  const double * const q = atom->q;
  const double * const * const x = atom->x;
  const int nthreads = comm->nthreads;
  const int nlocal = atom->nlocal;

#if defined(_OPENMP)
#pragma omp parallel default(none)
#endif
  {
#if defined(_OPENMP)
    // each thread works on a fixed chunk of atoms.
    const int tid = omp_get_thread_num();
    const int inum = nlocal;
    const int idelta = 1 + inum/nthreads;
    const int ifrom = tid*idelta;
    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
#else
    const int tid = 0;
    const int ifrom = 0;
    const int ito = nlocal;
#endif

    int l,m,n,nx,ny,nz,mx,my,mz;
    FFT_SCALAR dx,dy,dz,x0,y0,z0;

    // set up and clear this thread's copy of the 3d density array
    const int nzoffs = (nzhi_out-nzlo_out+1)*tid;
    FFT_SCALAR * const * const * const db = &(density_brick[nzoffs]);
    memset(&(db[nzlo_out][nylo_out][nxlo_out]),0,ngrid*sizeof(FFT_SCALAR));

    ThrData *thr = fix->get_thr(tid);
    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

    // loop over my charges, add their contribution to nearby grid points
    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
    // (dx,dy,dz) = distance to "lower left" grid pt
    // (mx,my,mz) = global coords of moving stencil pt

    // this if protects against having more threads than local atoms
    if (ifrom < nlocal) {
      for (int i = ifrom; i < ito; i++) {

        nx = part2grid[i][0];
        ny = part2grid[i][1];
        nz = part2grid[i][2];
        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

        compute_rho1d_thr(r1d,dx,dy,dz);

        z0 = delvolinv * q[i];
        for (n = nlower; n <= nupper; n++) {
          mz = n+nz;
          y0 = z0*r1d[2][n];
          for (m = nlower; m <= nupper; m++) {
            my = m+ny;
            x0 = y0*r1d[1][m];
            for (l = nlower; l <= nupper; l++) {
              mx = l+nx;
              db[mz][my][mx] += x0*r1d[0][l];
            }
          }
        }
      }
    }
#if defined(_OPENMP)
    // reduce 3d density array
    if (nthreads > 1) {
      data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),
                      ngrid,nthreads,1,tid);
    }
#endif
  }
}
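/* ----------------------------------------------------------------------
   layout sketch (illustration only): allocate() grows density_brick
   along z so that each thread owns a private copy of the local brick,

     z index:  nzlo_out ................................. nzend
     contents: [ brick of thread 0 | brick of thread 1 | ... ]

   each thread charge-assigns into its own copy above, and
   data_reduce_fft() then sums all copies into the thread 0 brick that
   the unthreaded parts of PPPM operate on.
------------------------------------------------------------------------- */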
/* ----------------------------------------------------------------------
   interpolate from grid to get electric field & force on my particles
------------------------------------------------------------------------- */

void PPPMOMP::fieldforce()
{
  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // ek = 3 components of E-field on particle

  const double * const q = atom->q;
  const double * const * const x = atom->x;
  const int nthreads = comm->nthreads;
  const int nlocal = atom->nlocal;
  const double qqrd2e = force->qqrd2e;

#if defined(_OPENMP)
#pragma omp parallel default(none)
  {
    // each thread works on a fixed chunk of atoms.
    const int tid = omp_get_thread_num();
    const int inum = nlocal;
    const int idelta = 1 + inum/nthreads;
    const int ifrom = tid*idelta;
    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
#else
    const int ifrom = 0;
    const int ito = nlocal;
    const int tid = 0;
#endif
    ThrData *thr = fix->get_thr(tid);
    double * const * const f = thr->get_f();
    FFT_SCALAR * const * const r1d = static_cast<FFT_SCALAR **>(thr->get_rho1d());

    int l,m,n,nx,ny,nz,mx,my,mz;
    FFT_SCALAR dx,dy,dz,x0,y0,z0;
    FFT_SCALAR ekx,eky,ekz;

    // this if protects against having more threads than local atoms
    if (ifrom < nlocal) {
      for (int i = ifrom; i < ito; i++) {

        nx = part2grid[i][0];
        ny = part2grid[i][1];
        nz = part2grid[i][2];
        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;

        compute_rho1d_thr(r1d,dx,dy,dz);

        ekx = eky = ekz = ZEROF;
        for (n = nlower; n <= nupper; n++) {
          mz = n+nz;
          z0 = r1d[2][n];
          for (m = nlower; m <= nupper; m++) {
            my = m+ny;
            y0 = z0*r1d[1][m];
            for (l = nlower; l <= nupper; l++) {
              mx = l+nx;
              x0 = y0*r1d[0][l];
              ekx -= x0*vdx_brick[mz][my][mx];
              eky -= x0*vdy_brick[mz][my][mx];
              ekz -= x0*vdz_brick[mz][my][mx];
            }
          }
        }

        // convert E-field to force
        const double qfactor = qqrd2e*scale*q[i];
        f[i][0] += qfactor*ekx;
        f[i][1] += qfactor*eky;
        f[i][2] += qfactor*ekz;
      }
    }
#if defined(_OPENMP)
  }
#endif
}
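/* ----------------------------------------------------------------------
   note (added for clarity): fieldforce() accumulates into the
   per-thread force arrays obtained from ThrData::get_f(); those
   copies are folded back into the global force array by the
   reduce_thr() call at the end of PPPMOMP::compute().
------------------------------------------------------------------------- */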
/* ----------------------------------------------------------------------
   charge assignment into rho1d
   dx,dy,dz = distance of particle from "lower left" grid point
------------------------------------------------------------------------- */

void PPPMOMP::compute_rho1d_thr(FFT_SCALAR * const * const r1d,
                                const FFT_SCALAR &dx, const FFT_SCALAR &dy,
                                const FFT_SCALAR &dz)
{
  int k,l;
  FFT_SCALAR r1,r2,r3;

  for (k = (1-order)/2; k <= order/2; k++) {
    r1 = r2 = r3 = ZEROF;

    // evaluate the assignment polynomials via Horner's rule
    for (l = order-1; l >= 0; l--) {
      r1 = rho_coeff[l][k] + r1*dx;
      r2 = rho_coeff[l][k] + r2*dy;
      r3 = rho_coeff[l][k] + r3*dz;
    }
    r1d[0][k] = r1;
    r1d[1][k] = r2;
    r1d[2][k] = r3;
  }
}
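/* ----------------------------------------------------------------------
   usage sketch (illustrative, comment only): this threaded solver is
   normally selected through the OMP suffix machinery, e.g. in a LAMMPS
   input script along the lines of

     package      omp 4
     suffix       omp
     kspace_style pppm 1.0e-4

   or by requesting the style directly with "kspace_style pppm/omp".
------------------------------------------------------------------------- */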