From 6c836ed0e4c19edcb023e066c70705db9275c8e1 Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Wed, 7 Mar 2012 15:51:43 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@7924
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 src/KSPACE/ewald.cpp         |  9 ++--
 src/KSPACE/pppm.cpp          |  9 ++--
 src/KSPACE/pppm_cg.cpp       | 85 ++++++++++++++++++++++++++++++--
 src/KSPACE/pppm_cg.h         |  3 +-
 src/USER-OMP/pppm_cg_omp.cpp | 95 +++++++++++++++++++++++++++++++++++-
 src/USER-OMP/pppm_cg_omp.h   |  1 +
 src/USER-OMP/pppm_omp.cpp    | 91 +++++++++++++++++++++++++++++++++-
 src/USER-OMP/pppm_omp.h      |  1 +
 8 files changed, 281 insertions(+), 13 deletions(-)

diff --git a/src/KSPACE/ewald.cpp b/src/KSPACE/ewald.cpp
index a95d4a3302..fcb7014ffa 100644
--- a/src/KSPACE/ewald.cpp
+++ b/src/KSPACE/ewald.cpp
@@ -151,9 +151,12 @@ void Ewald::init()
   // fluid-occupied volume used to estimate real-space error
   // zprd used rather than zprd_slab
 
-  if (!gewaldflag)
-    g_ewald = sqrt(-log(accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / 
-                        (2.0*q2))) / cutoff;
+  if (!gewaldflag) {
+    g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*q2);
+    if (g_ewald >= 1.0)
+      error->all(FLERR,"KSpace accuracy too large to estimate G vector");
+    g_ewald = sqrt(-log(g_ewald)) / cutoff;
+  }
 
   // setup Ewald coefficients so can print stats
 
diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp
index 8fae7b2d55..94e71f29cb 100644
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@@ -993,9 +993,12 @@ void PPPM::set_grid()
   double h_x,h_y,h_z;
   bigint natoms = atom->natoms;
 
-  if (!gewaldflag)
-    g_ewald = sqrt(-log(accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / 
-			(2.0*q2))) / cutoff;
+  if (!gewaldflag) {
+    g_ewald = accuracy*sqrt(natoms*cutoff*xprd*yprd*zprd) / (2.0*q2);
+    if (g_ewald >= 1.0)
+      error->all(FLERR,"KSpace accuracy too large to estimate G vector");
+    g_ewald = sqrt(-log(g_ewald)) / cutoff;
+  } 
 
   // set optimal nx_pppm,ny_pppm,nz_pppm based on order and accuracy
   // nz_pppm uses extended zprd_slab instead of zprd
diff --git a/src/KSPACE/pppm_cg.cpp b/src/KSPACE/pppm_cg.cpp
index dd6396fe38..fa4ece88ed 100644
--- a/src/KSPACE/pppm_cg.cpp
+++ b/src/KSPACE/pppm_cg.cpp
@@ -221,7 +221,8 @@ void PPPMCG::compute(int eflag, int vflag)
     int nlocal = atom->nlocal;
 
     if (eflag_atom) {
-      for (i = 0; i < nlocal; i++) {
+      for (int j = 0; j < num_charged; j++) {
+        int i = is_charged[j];
 	eatom[i] *= 0.5;
         eatom[i] -= g_ewald*q[i]*q[i]/MY_PIS + MY_PI2*q[i]*qsum / 
 	  (g_ewald*g_ewald*volume);
@@ -230,8 +231,10 @@ void PPPMCG::compute(int eflag, int vflag)
     }
 
     if (vflag_atom) {
-      for (i = 0; i < nlocal; i++)
-        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*q[i]*qscale;
+      for (int j = 0; j < num_charged; j++) {
+        int i = is_charged[j];
+        for (int n = 0; n < 6; n++) vatom[i][n] *= 0.5*q[i]*qscale;
+      }
     }
   }
 
@@ -393,6 +396,72 @@ void PPPMCG::fieldforce()
   }
 }
 
+/* ----------------------------------------------------------------------
+ interpolate from grid to get per-atom energy/virial
+ ------------------------------------------------------------------------- */
+
+void PPPMCG::fieldforce_peratom()
+{
+    int i,l,m,n,nx,ny,nz,mx,my,mz;
+    FFT_SCALAR dx,dy,dz,x0,y0,z0;
+    FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+
+    // loop over my charges, interpolate from nearby grid points
+    // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+    // (dx,dy,dz) = distance to "lower left" grid pt
+    // (mx,my,mz) = global coords of moving stencil pt
+
+    double *q = atom->q;
+    double **x = atom->x;
+    double **f = atom->f;
+
+    for (int j = 0; j < num_charged; j++) {
+        i = is_charged[j];
+
+        nx = part2grid[i][0];
+        ny = part2grid[i][1];
+        nz = part2grid[i][2];
+        dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+        dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+        dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+        compute_rho1d(dx,dy,dz);
+
+        u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
+        for (n = nlower; n <= nupper; n++) {
+            mz = n+nz;
+            z0 = rho1d[2][n];
+            for (m = nlower; m <= nupper; m++) {
+                my = m+ny;
+                y0 = z0*rho1d[1][m];
+                for (l = nlower; l <= nupper; l++) {
+                    mx = l+nx;
+                    x0 = y0*rho1d[0][l];
+                    if (eflag_atom) u += x0*u_brick[mz][my][mx];
+                    if (vflag_atom) {
+                        v0 += x0*v0_brick[mz][my][mx];
+                        v1 += x0*v1_brick[mz][my][mx];
+                        v2 += x0*v2_brick[mz][my][mx];
+                        v3 += x0*v3_brick[mz][my][mx];
+                        v4 += x0*v4_brick[mz][my][mx];
+                        v5 += x0*v5_brick[mz][my][mx];
+                    }
+                }
+            }
+        }
+
+        if (eflag_atom) eatom[i] += q[i]*u;
+        if (vflag_atom) {
+            vatom[i][0] += v0;
+            vatom[i][1] += v1;
+            vatom[i][2] += v2;
+            vatom[i][3] += v3;
+            vatom[i][4] += v4;
+            vatom[i][5] += v5;
+        }
+    }
+}
+
 /* ----------------------------------------------------------------------
    Slab-geometry correction term to dampen inter-slab interactions between
    periodically repeating slabs.  Yields good approximation to 2D Ewald if 
@@ -425,6 +494,16 @@ void PPPMCG::slabcorr()
   
   if (eflag_global) energy += qscale * e_slabcorr;
 
+  //per-atom energy
+
+  if (eflag_atom) {
+    double efact = 2.0*MY_PI*dipole_all/volume;
+    for (int j = 0; j < num_charged; j++) {
+      int i = is_charged[j];
+      eatom[i] += qscale * q[i]*x[i][2]*efact;
+    }
+  }
+
   // add on force corrections
 
   const double ffact = -4.0*MY_PI*dipole_all/volume * qscale; 
diff --git a/src/KSPACE/pppm_cg.h b/src/KSPACE/pppm_cg.h
index 8fe6c2825d..4c0c1baa27 100644
--- a/src/KSPACE/pppm_cg.h
+++ b/src/KSPACE/pppm_cg.h
@@ -1,4 +1,4 @@
-/* ----------------------------------------------------------------------
+/* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
@@ -39,6 +39,7 @@ class PPPMCG : public PPPM {
   virtual void particle_map();
   virtual void make_rho();
   virtual void fieldforce();
+  virtual void fieldforce_peratom();
   virtual void slabcorr();
 };
 
diff --git a/src/USER-OMP/pppm_cg_omp.cpp b/src/USER-OMP/pppm_cg_omp.cpp
index 32526d2d2e..aa15a96517 100644
--- a/src/USER-OMP/pppm_cg_omp.cpp
+++ b/src/USER-OMP/pppm_cg_omp.cpp
@@ -344,7 +344,9 @@ void PPPMCGOMP::make_rho()
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
+#endif
   {  
+#if defined(_OPENMP)
     // each thread works on a fixed chunk of atoms.
     const int tid = omp_get_thread_num();
     const int inum = num_charged;
@@ -407,8 +409,8 @@ void PPPMCGOMP::make_rho()
     if (nthreads > 1) {
       data_reduce_fft(&(density_brick[nzlo_out][nylo_out][nxlo_out]),ngrid,nthreads,1,tid);
     }
-  }
 #endif
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -430,7 +432,9 @@ void PPPMCGOMP::fieldforce()
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
+#endif
   {  
+#if defined(_OPENMP)
     // each thread works on a fixed chunk of atoms.
     const int tid = omp_get_thread_num();
     const int inum = num_charged;
@@ -488,9 +492,96 @@ void PPPMCGOMP::fieldforce()
 	f[i][2] += qfactor*ekz;
       }
     }
-#if defined(_OPENMP)
   }
+}
+
+/* ----------------------------------------------------------------------
+ interpolate from grid to get per-atom energy/virial
+ ------------------------------------------------------------------------- */
+
+void PPPMCGOMP::fieldforce_peratom()
+{
+  // loop over my charges, interpolate from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
+  const int nthreads = comm->nthreads;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
 #endif
+  {
+#if defined(_OPENMP)
+    // each thread works on a fixed chunk of atoms.
+    const int tid = omp_get_thread_num();
+    const int inum = num_charged;
+    const int idelta = 1 + inum/nthreads;
+    const int ifrom = tid*idelta;
+    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
+#else
+    const int ifrom = 0;
+    const int ito = num_charged;
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    FFT_SCALAR * const * const r1d =  static_cast<FFT_SCALAR **>(thr->get_rho1d());
+
+    int i,j,l,m,n,nx,ny,nz,mx,my,mz;
+    FFT_SCALAR dx,dy,dz,x0,y0,z0;
+    FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+
+    // this if protects against having more threads than charged local atoms
+    if (ifrom < num_charged) {
+      for (j = ifrom; j < ito; j++) {
+	i = is_charged[j];
+
+	nx = part2grid[i][0];
+	ny = part2grid[i][1];
+	nz = part2grid[i][2];
+	dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+	dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+	dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+	compute_rho1d_thr(r1d,dx,dy,dz);
+
+	u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
+	for (n = nlower; n <= nupper; n++) {
+	  mz = n+nz;
+	  z0 = r1d[2][n];
+	  for (m = nlower; m <= nupper; m++) {
+	    my = m+ny;
+	    y0 = z0*r1d[1][m];
+	    for (l = nlower; l <= nupper; l++) {
+	      mx = l+nx;
+	      x0 = y0*r1d[0][l];
+	      if (eflag_atom) u += x0*u_brick[mz][my][mx];
+	      if (vflag_atom) {
+		v0 += x0*v0_brick[mz][my][mx];
+		v1 += x0*v1_brick[mz][my][mx];
+		v2 += x0*v2_brick[mz][my][mx];
+		v3 += x0*v3_brick[mz][my][mx];
+		v4 += x0*v4_brick[mz][my][mx];
+		v5 += x0*v5_brick[mz][my][mx];
+	      }
+	    }
+	  }
+	}
+
+	if (eflag_atom) eatom[i] += q[i]*u;
+	if (vflag_atom) {
+	  vatom[i][0] += v0;
+	  vatom[i][1] += v1;
+	  vatom[i][2] += v2;
+	  vatom[i][3] += v3;
+	  vatom[i][4] += v4;
+	  vatom[i][5] += v5;
+	}
+      }
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/USER-OMP/pppm_cg_omp.h b/src/USER-OMP/pppm_cg_omp.h
index 4c32630271..598c5522a6 100644
--- a/src/USER-OMP/pppm_cg_omp.h
+++ b/src/USER-OMP/pppm_cg_omp.h
@@ -36,6 +36,7 @@ namespace LAMMPS_NS {
   virtual void allocate();
   virtual void deallocate();
   virtual void fieldforce();
+  virtual void fieldforce_peratom();
   virtual void make_rho();
 
   void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,
diff --git a/src/USER-OMP/pppm_omp.cpp b/src/USER-OMP/pppm_omp.cpp
index 706e3e4270..496ee27aa8 100644
--- a/src/USER-OMP/pppm_omp.cpp
+++ b/src/USER-OMP/pppm_omp.cpp
@@ -434,7 +434,9 @@ void PPPMOMP::fieldforce()
 
 #if defined(_OPENMP)
 #pragma omp parallel default(none)
+#endif
   {  
+#if defined(_OPENMP)
     // each thread works on a fixed chunk of atoms.
     const int tid = omp_get_thread_num();
     const int inum = nlocal;
@@ -491,9 +493,96 @@ void PPPMOMP::fieldforce()
 	f[i][2] += qfactor*ekz;
       }
     }
-#if defined(_OPENMP)
   }
+}
+
+/* ----------------------------------------------------------------------
+ interpolate from grid to get per-atom energy/virial
+ ------------------------------------------------------------------------- */
+
+void PPPMOMP::fieldforce_peratom()
+{
+  // loop over my charges, interpolate from nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  const double * const q = atom->q;
+  const double * const * const x = atom->x;
+  const int nthreads = comm->nthreads;
+  const int nlocal = atom->nlocal;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none)
 #endif
+  {
+#if defined(_OPENMP)
+    // each thread works on a fixed chunk of atoms.
+    const int tid = omp_get_thread_num();
+    const int inum = nlocal;
+    const int idelta = 1 + inum/nthreads;
+    const int ifrom = tid*idelta;
+    const int ito = ((ifrom + idelta) > inum) ? inum : ifrom + idelta;
+#else
+    const int ifrom = 0;
+    const int ito = nlocal;
+    const int tid = 0;
+#endif
+    ThrData *thr = fix->get_thr(tid);
+    FFT_SCALAR * const * const r1d =  static_cast<FFT_SCALAR **>(thr->get_rho1d());
+
+    int i,l,m,n,nx,ny,nz,mx,my,mz;
+    FFT_SCALAR dx,dy,dz,x0,y0,z0;
+    FFT_SCALAR u,v0,v1,v2,v3,v4,v5;
+
+    // this if protects against having more threads than local atoms
+    if (ifrom < nlocal) {
+      for (int i = ifrom; i < ito; i++) {
+
+	nx = part2grid[i][0];
+	ny = part2grid[i][1];
+	nz = part2grid[i][2];
+	dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+	dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+	dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+	compute_rho1d_thr(r1d,dx,dy,dz);
+
+	u = v0 = v1 = v2 = v3 = v4 = v5 = ZEROF;
+	for (n = nlower; n <= nupper; n++) {
+	  mz = n+nz;
+	  z0 = r1d[2][n];
+	  for (m = nlower; m <= nupper; m++) {
+	    my = m+ny;
+	    y0 = z0*r1d[1][m];
+	    for (l = nlower; l <= nupper; l++) {
+	      mx = l+nx;
+	      x0 = y0*r1d[0][l];
+	      if (eflag_atom) u += x0*u_brick[mz][my][mx];
+	      if (vflag_atom) {
+		v0 += x0*v0_brick[mz][my][mx];
+		v1 += x0*v1_brick[mz][my][mx];
+		v2 += x0*v2_brick[mz][my][mx];
+		v3 += x0*v3_brick[mz][my][mx];
+		v4 += x0*v4_brick[mz][my][mx];
+		v5 += x0*v5_brick[mz][my][mx];
+	      }
+	    }
+	  }
+	}
+
+	if (eflag_atom) eatom[i] += q[i]*u;
+	if (vflag_atom) {
+	  vatom[i][0] += v0;
+	  vatom[i][1] += v1;
+	  vatom[i][2] += v2;
+	  vatom[i][3] += v3;
+	  vatom[i][4] += v4;
+	  vatom[i][5] += v5;
+	}
+      }
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/USER-OMP/pppm_omp.h b/src/USER-OMP/pppm_omp.h
index 8abcd64aa6..93caa6ed6e 100644
--- a/src/USER-OMP/pppm_omp.h
+++ b/src/USER-OMP/pppm_omp.h
@@ -36,6 +36,7 @@ namespace LAMMPS_NS {
   virtual void allocate();
   virtual void deallocate();
   virtual void fieldforce();
+  virtual void fieldforce_peratom();
   virtual void make_rho();
 
   void compute_rho1d_thr(FFT_SCALAR * const * const, const FFT_SCALAR &,