diff --git a/src/USER-OMP/msm_omp.cpp b/src/USER-OMP/msm_omp.cpp index 480e52ce2f..18e2086a3b 100755 --- a/src/USER-OMP/msm_omp.cpp +++ b/src/USER-OMP/msm_omp.cpp @@ -112,12 +112,16 @@ void MSMOMP::direct(int n) direct_eval<0,0,0>(n); } } + + if (vflag_atom) + direct_peratom<1>(n); + else + direct_peratom<0>(n); } template void MSMOMP::direct_eval(const int nn) { - double * _noalias const * _noalias const * _noalias const egridn = egrid[nn]; const double * _noalias const * _noalias const * _noalias const qgridn = qgrid[nn]; const double * _noalias const g_directn = g_direct[nn]; const double * _noalias const v0_directn = v0_direct[nn]; @@ -157,14 +161,14 @@ void MSMOMP::direct_eval(const int nn) #pragma omp parallel default(none) reduction(+:v0,v1,v2,v3,v4,v5,emsm) #endif { - double qtmp,esum,v0sum,v1sum,v2sum,v3sum,v4sum,v5sum; + double esum,v0sum,v1sum,v2sum,v3sum,v4sum,v5sum; int i,ifrom,ito,tid,icx,icy,icz,ix,iy,iz,k; loop_setup_thr(ifrom, ito, tid, inum, comm->nthreads); for (i = ifrom; i < ito; ++i) { - // infer outer loop indices icx, icy, icz from master loop index + // infer outer loop indices icx, icy, icz from master loop index i icz = i/(numy*numx); icy = (i - icz*numy*numx) / numx; @@ -173,41 +177,102 @@ void MSMOMP::direct_eval(const int nn) icy += nylo_inn; icx += nxlo_inn; - const int kmin = zper ? nzlo_direct : MAX(nzlo_direct,alphan - icz); const int kmax = zper ? nzhi_direct : MIN(nzhi_direct,betazn - icz); const int jmin = yper ? nylo_direct : MAX(nylo_direct,alphan - icy); const int jmax = yper ? nyhi_direct : MIN(nyhi_direct,betayn - icy); const int imin = xper ? nxlo_direct : MAX(nxlo_direct,alphan - icx); const int imax = xper ? nxhi_direct : MIN(nxhi_direct,betaxn - icx); + const double qtmp = qgridn[icz][icy][icx]; // charge on center grid point + esum = 0.0; if (VFLAG_GLOBAL || VFLAG_ATOM) v0sum = v1sum = v2sum = v3sum = v4sum = v5sum = 0.0; - for (iz = kmin; iz <= kmax; iz++) { + // use hemisphere to avoid double computation of pair-wise + // interactions in direct sum (no computations in -z direction) + + for (iz = 1; iz <= kmax; iz++) { const int kk = icz+iz; const int zk = (iz + nzhi_direct)*ny; for (iy = jmin; iy <= jmax; iy++) { const int jj = icy+iy; const int zyk = (zk + iy + nyhi_direct)*nx; - const double * _noalias const qgridnkj = & qgridn[kk][jj][icx]; + const double * _noalias const qgridnkj = &qgridn[kk][jj][icx]; for (ix = imin; ix <= imax; ix++) { - qtmp = qgridnkj[ix]; + const double qtmp2 = qgridnkj[ix]; k = zyk + ix + nxhi_direct; - esum += g_directn[k] * qtmp; + const double gtmp = g_directn[k]; + esum += gtmp * qtmp2; if (VFLAG_GLOBAL || VFLAG_ATOM) { - v0sum += v0_directn[k] * qtmp; - v1sum += v1_directn[k] * qtmp; - v2sum += v2_directn[k] * qtmp; - v3sum += v3_directn[k] * qtmp; - v4sum += v4_directn[k] * qtmp; - v5sum += v5_directn[k] * qtmp; + v0sum += v0_directn[k] * qtmp2; + v1sum += v1_directn[k] * qtmp2; + v2sum += v2_directn[k] * qtmp2; + v3sum += v3_directn[k] * qtmp2; + v4sum += v4_directn[k] * qtmp2; + v5sum += v5_directn[k] * qtmp2; } } } } - egridn[icz][icy][icx] = esum; + + // iz=0 + + const int zk = nzhi_direct*ny; + for (iy = 1; iy <= jmax; iy++) { + const int jj = icy+iy; + const int zyk = (zk + iy + nyhi_direct)*nx; + const double * _noalias const qgridnkj = &qgridn[icz][jj][icx]; + for (ix = imin; ix <= imax; ix++) { + const double qtmp2 = qgridnkj[ix]; + k = zyk + ix + nxhi_direct; + const double gtmp = g_directn[k]; + esum += gtmp * qtmp2; + + if (VFLAG_GLOBAL || VFLAG_ATOM) { + v0sum += v0_directn[k] * qtmp2; + v1sum += v1_directn[k] * qtmp2; + v2sum += v2_directn[k] * qtmp2; + v3sum += v3_directn[k] * qtmp2; + v4sum += v4_directn[k] * qtmp2; + v5sum += v5_directn[k] * qtmp2; + } + } + } + + // iz=0, iy=0 + + const int zyk = (zk + nyhi_direct)*nx; + const double * _noalias const qgridnkj = &qgridn[icz][icy][icx]; + for (ix = 1; ix <= imax; ix++) { + const double qtmp2 = qgridnkj[ix]; + k = zyk + ix + nxhi_direct; + const double gtmp = g_directn[k]; + esum += gtmp * qtmp2; + + if (VFLAG_GLOBAL || VFLAG_ATOM) { + v0sum += v0_directn[k] * qtmp2; + v1sum += v1_directn[k] * qtmp2; + v2sum += v2_directn[k] * qtmp2; + v3sum += v3_directn[k] * qtmp2; + v4sum += v4_directn[k] * qtmp2; + v5sum += v5_directn[k] * qtmp2; + } + } + + // iz=0, iy=0, ix=0 + + const double qtmp2 = qgridnkj[0]; + k = zyk + nxhi_direct; + const double gtmp = g_directn[k]; + esum += 0.5 * gtmp * qtmp2; + + // virial is zero for iz=0, iy=0, ix=0 + + // accumulate per-atom energy/virial + + egrid[n][icz][icy][icx] = esum; if (VFLAG_ATOM) { v0grid[n][icz][icy][icx] = v0sum; @@ -219,15 +284,15 @@ void MSMOMP::direct_eval(const int nn) } if (EFLAG_GLOBAL || VFLAG_GLOBAL) { - qtmp = qgridn[icz][icy][icx]; - if (EFLAG_GLOBAL) emsm += esum * qtmp; + const double qtmp3 = qgridn[icz][icy][icx]; + if (EFLAG_GLOBAL) emsm += 2.0 * esum * qtmp3; if (VFLAG_GLOBAL) { - v0 += v0sum * qtmp; - v1 += v1sum * qtmp; - v2 += v2sum * qtmp; - v3 += v3sum * qtmp; - v4 += v4sum * qtmp; - v5 += v5sum * qtmp; + v0 += 2.0 * v0sum * qtmp3; + v1 += 2.0 * v1sum * qtmp3; + v2 += 2.0 * v2sum * qtmp3; + v3 += 2.0 * v3sum * qtmp3; + v4 += 2.0 * v4sum * qtmp3; + v5 += 2.0 * v5sum * qtmp3; } } } @@ -245,3 +310,155 @@ void MSMOMP::direct_eval(const int nn) } } } + +template +void MSMOMP::direct_peratom(const int nn) +{ + double * _noalias const * _noalias const * _noalias const egridn = egrid[nn]; + double * _noalias const * _noalias const * _noalias const v0gridn = v0grid[nn]; + double * _noalias const * _noalias const * _noalias const v1gridn = v1grid[nn]; + double * _noalias const * _noalias const * _noalias const v2gridn = v2grid[nn]; + double * _noalias const * _noalias const * _noalias const v3gridn = v3grid[nn]; + double * _noalias const * _noalias const * _noalias const v4gridn = v4grid[nn]; + double * _noalias const * _noalias const * _noalias const v5gridn = v5grid[nn]; + const double * _noalias const * _noalias const * _noalias const qgridn = qgrid[nn]; + const double * _noalias const g_directn = g_direct[nn]; + const double * _noalias const v0_directn = v0_direct[nn]; + const double * _noalias const v1_directn = v1_direct[nn]; + const double * _noalias const v2_directn = v2_direct[nn]; + const double * _noalias const v3_directn = v3_direct[nn]; + const double * _noalias const v4_directn = v4_direct[nn]; + const double * _noalias const v5_directn = v5_direct[nn]; + + + const int alphan = alpha[nn]; + const int betaxn = betax[nn]; + const int betayn = betay[nn]; + const int betazn = betaz[nn]; + + const int nx = nxhi_direct - nxlo_direct + 1; + const int ny = nyhi_direct - nylo_direct + 1; + + // merge three outer loops into one + + const int nzlo_inn = nzlo_in[nn]; + const int nylo_inn = nylo_in[nn]; + const int nxlo_inn = nxlo_in[nn]; + const int numz = nzhi_in[nn] - nzlo_inn + 1; + const int numy = nyhi_in[nn] - nylo_inn + 1; + const int numx = nxhi_in[nn] - nxlo_inn + 1; + const int inum = numz*numy*numx; + + const int zper = domain->zperiodic; + const int yper = domain->yperiodic; + const int xper = domain->xperiodic; + + const int n=nn; + int i,ifrom,ito,tid,icx,icy,icz,ix,iy,iz,k; + + + for (i = 0; i < inum; ++i) { + + // infer outer loop indices icx, icy, icz from master loop index i + + icz = i/(numy*numx); + icy = (i - icz*numy*numx) / numx; + icx = i - icz*numy*numx - icy*numx; + icz += nzlo_inn; + icy += nylo_inn; + icx += nxlo_inn; + + const int kmax = zper ? nzhi_direct : MIN(nzhi_direct,betazn - icz); + const int jmin = yper ? nylo_direct : MAX(nylo_direct,alphan - icy); + const int jmax = yper ? nyhi_direct : MIN(nyhi_direct,betayn - icy); + const int imin = xper ? nxlo_direct : MAX(nxlo_direct,alphan - icx); + const int imax = xper ? nxhi_direct : MIN(nxhi_direct,betaxn - icx); + + const double qtmp = qgridn[icz][icy][icx]; // charge on center grid point + + + // use hemisphere to avoid double computation of pair-wise + // interactions in direct sum (no computations in -z direction) + + for (iz = 1; iz <= kmax; iz++) { + const int kk = icz+iz; + const int zk = (iz + nzhi_direct)*ny; + for (iy = jmin; iy <= jmax; iy++) { + const int jj = icy+iy; + const int zyk = (zk + iy + nyhi_direct)*nx; + double * _noalias const egridnkj = &egridn[kk][jj][icx]; + for (ix = imin; ix <= imax; ix++) { + k = zyk + ix + nxhi_direct; + const int ii = icx+ix; + const double gtmp = g_directn[k]; + + egridnkj[ix] += gtmp * qtmp; + + if (VFLAG_ATOM) { + v0gridn[kk][jj][ii] += v0_directn[k] * qtmp; + v1gridn[kk][jj][ii] += v1_directn[k] * qtmp; + v2gridn[kk][jj][ii] += v2_directn[k] * qtmp; + v3gridn[kk][jj][ii] += v3_directn[k] * qtmp; + v4gridn[kk][jj][ii] += v4_directn[k] * qtmp; + v5gridn[kk][jj][ii] += v5_directn[k] * qtmp; + } + } + } + } + + // iz=0 + + const int zk = nzhi_direct*ny; + for (iy = 1; iy <= jmax; iy++) { + const int jj = icy+iy; + const int zyk = (zk + iy + nyhi_direct)*nx; + double * _noalias const egridnkj = &egridn[icz][jj][icx]; + for (ix = imin; ix <= imax; ix++) { + k = zyk + ix + nxhi_direct; + const int ii = icx+ix; + const double gtmp = g_directn[k]; + + egridnkj[ix] += gtmp * qtmp; + + if (VFLAG_ATOM) { + v0gridn[icz][jj][ii] += v0_directn[k] * qtmp; + v1gridn[icz][jj][ii] += v1_directn[k] * qtmp; + v2gridn[icz][jj][ii] += v2_directn[k] * qtmp; + v3gridn[icz][jj][ii] += v3_directn[k] * qtmp; + v4gridn[icz][jj][ii] += v4_directn[k] * qtmp; + v5gridn[icz][jj][ii] += v5_directn[k] * qtmp; + } + } + } + + // iz=0, iy=0 + + const int zyk = (zk + nyhi_direct)*nx; + double * _noalias const egridnkj = &egridn[icz][icy][icx]; + for (ix = 1; ix <= imax; ix++) { + k = zyk + ix + nxhi_direct; + const int ii = icx+ix; + const double gtmp = g_directn[k]; + + egridnkj[ix] += gtmp * qtmp; + + if (VFLAG_ATOM) { + v0gridn[icz][icy][ii] += v0_directn[k] * qtmp; + v1gridn[icz][icy][ii] += v1_directn[k] * qtmp; + v2gridn[icz][icy][ii] += v2_directn[k] * qtmp; + v3gridn[icz][icy][ii] += v3_directn[k] * qtmp; + v4gridn[icz][icy][ii] += v4_directn[k] * qtmp; + v5gridn[icz][icy][ii] += v5_directn[k] * qtmp; + } + } + + // iz=0, iy=0, ix=0 + + k = zyk + nxhi_direct; + const double gtmp = g_directn[k]; + egridnkj[0] += 0.5 * gtmp * qtmp; + + // virial is zero for iz=0, iy=0, ix=0 + + } +} diff --git a/src/USER-OMP/msm_omp.h b/src/USER-OMP/msm_omp.h index 369e23be49..0a41c57ac3 100755 --- a/src/USER-OMP/msm_omp.h +++ b/src/USER-OMP/msm_omp.h @@ -1,45 +1,46 @@ -/* -*- c++ -*- ---------------------------------------------------------- - LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories - Steve Plimpton, sjplimp@sandia.gov - - Copyright (2003) Sandia Corporation. Under the terms of Contract - DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains - certain rights in this software. This software is distributed under - the GNU General Public License. - - See the README file in the top-level LAMMPS directory. -------------------------------------------------------------------------- */ - -#ifdef KSPACE_CLASS - -KSpaceStyle(msm/omp,MSMOMP) - -#else - -#ifndef LMP_MSM_OMP_H -#define LMP_MSM_OMP_H - -#include "msm.h" -#include "thr_omp.h" - -namespace LAMMPS_NS { - - class MSMOMP : public MSM, public ThrOMP { - public: - MSMOMP(class LAMMPS *, int, char **); - virtual ~MSMOMP () {}; - - protected: - virtual void direct(int); - virtual void compute(int,int); - - private: - template void direct_eval(int); - -}; - -} - -#endif -#endif +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef KSPACE_CLASS + +KSpaceStyle(msm/omp,MSMOMP) + +#else + +#ifndef LMP_MSM_OMP_H +#define LMP_MSM_OMP_H + +#include "msm.h" +#include "thr_omp.h" + +namespace LAMMPS_NS { + + class MSMOMP : public MSM, public ThrOMP { + public: + MSMOMP(class LAMMPS *, int, char **); + virtual ~MSMOMP () {}; + + protected: + virtual void direct(int); + virtual void compute(int,int); + + private: + template void direct_eval(int); + template void direct_peratom(int); + +}; + +} + +#endif +#endif