From 062c1a04fcc129f2ba61edaac2f7b35e07e11e81 Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Tue, 14 Aug 2018 14:42:01 -0600
Subject: [PATCH] Commit JT 081418 - initial commit pppm_spin branch - copied
 short_range spin files (src/SPIN) - copied/renamed Stan's file (from
 pppm_dipole branch)

---
 src/DIPOLE/pair_lj_cut_dipole_long.cpp |    2 +-
 src/KSPACE/pppm.h                      |    8 +-
 src/KSPACE/pppm_spin.cpp               | 2559 ++++++++++++++++++++++++
 src/KSPACE/pppm_spin.h                 |  213 ++
 src/SPIN/pair_spin_long.cpp            |  550 +++++
 src/SPIN/pair_spin_long.h              |   97 +
 src/kspace.cpp                         |    4 +-
 src/kspace.h                           |    2 +-
 8 files changed, 3427 insertions(+), 8 deletions(-)
 create mode 100644 src/KSPACE/pppm_spin.cpp
 create mode 100644 src/KSPACE/pppm_spin.h
 create mode 100644 src/SPIN/pair_spin_long.cpp
 create mode 100644 src/SPIN/pair_spin_long.h
diff --git a/src/DIPOLE/pair_lj_cut_dipole_long.cpp b/src/DIPOLE/pair_lj_cut_dipole_long.cpp
index 817a120e3d..a0e7c1c4ec 100644
--- a/src/DIPOLE/pair_lj_cut_dipole_long.cpp
+++ b/src/DIPOLE/pair_lj_cut_dipole_long.cpp
@@ -44,7 +44,7 @@ using namespace MathConst;
 PairLJCutDipoleLong::PairLJCutDipoleLong(LAMMPS *lmp) : Pair(lmp)
 {
   single_enable = 0;
-  ewaldflag = dipoleflag = 1;
+  ewaldflag = pppmflag = dipoleflag = 1;
   respa_enable = 0;
 }
 
diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h
index 9cb6bebb25..c6d463b69c 100644
--- a/src/KSPACE/pppm.h
+++ b/src/KSPACE/pppm.h
@@ -41,7 +41,7 @@ class PPPM : public KSpace {
   virtual ~PPPM();
   virtual void init();
   virtual void setup();
-  void setup_grid();
+  virtual void setup_grid();
   virtual void compute(int, int);
   virtual int timing_1d(int, double &);
   virtual int timing_3d(int, double &);
@@ -105,10 +105,10 @@ class PPPM : public KSpace {
   double qdist;                // distance from O site to negative charge
   double alpha;                // geometric factor
 
-  void set_grid_global();
+  virtual void set_grid_global();
   void set_grid_local();
   void adjust_gewald();
-  double newton_raphson_f();
+  virtual double newton_raphson_f();
   double derivf();
   double final_accuracy();
 
@@ -145,7 +145,7 @@ class PPPM : public KSpace {
   void compute_drho1d(const FFT_SCALAR &, const FFT_SCALAR &,
                      const FFT_SCALAR &);
   void compute_rho_coeff();
-  void slabcorr();
+  virtual void slabcorr();
 
   // grid communication
 
diff --git a/src/KSPACE/pppm_spin.cpp b/src/KSPACE/pppm_spin.cpp
new file mode 100644
index 0000000000..32e91cc9b2
--- /dev/null
+++ b/src/KSPACE/pppm_spin.cpp
@@ -0,0 +1,2559 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Stan Moore (SNL)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <cstring>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+#include "pppm_dipole.h"
+#include "atom.h"
+#include "comm.h"
+#include "gridcomm.h"
+#include "neighbor.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "domain.h"
+#include "fft3d_wrap.h"
+#include "remap_wrap.h"
+#include "memory.h"
+#include "error.h"
+#include "update.h"
+
+#include "math_const.h"
+#include "math_special.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecial;
+
+#define MAXORDER 7   // largest stencil order accepted by init()
+#define OFFSET 16384
+#define LARGE 10000.0
+#define SMALL 0.00001
+#define EPS_HOC 1.0e-7
+
+enum{REVERSE_MU};   // flag passed to cg_dipole->reverse_comm() in compute()
+enum{FORWARD_MU,FORWARD_MU_PERATOM};   // flags passed to forward_comm() in compute()
+
+#ifdef FFT_SINGLE   // ZEROF/ONEF are float literals in this branch, double otherwise
+#define ZEROF 0.0f
+#define ONEF  1.0f
+#else
+#define ZEROF 0.0
+#define ONEF  1.0
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+PPPMDipole::PPPMDipole(LAMMPS *lmp, int narg, char **arg) : PPPM(lmp, narg, arg),
+  densityx_brick_dipole(NULL), densityy_brick_dipole(NULL), 
+  densityz_brick_dipole(NULL), ux_brick_dipole(NULL), 
+  uy_brick_dipole(NULL), uz_brick_dipole(NULL), vdxx_brick_dipole(NULL), 
+  vdxy_brick_dipole(NULL), vdyy_brick_dipole(NULL), 
+  vdxz_brick_dipole(NULL), vdyz_brick_dipole(NULL), 
+  vdzz_brick_dipole(NULL), v0x_brick_dipole(NULL), v1x_brick_dipole(NULL), 
+  v2x_brick_dipole(NULL), v3x_brick_dipole(NULL), v4x_brick_dipole(NULL), 
+  v5x_brick_dipole(NULL), v0y_brick_dipole(NULL), v1y_brick_dipole(NULL), 
+  v2y_brick_dipole(NULL), v3y_brick_dipole(NULL), v4y_brick_dipole(NULL), 
+  v5y_brick_dipole(NULL), v0z_brick_dipole(NULL), v1z_brick_dipole(NULL), 
+  v2z_brick_dipole(NULL), v3z_brick_dipole(NULL), v4z_brick_dipole(NULL), 
+  v5z_brick_dipole(NULL), work3(NULL), work4(NULL), 
+  densityx_fft_dipole(NULL), densityy_fft_dipole(NULL), 
+  densityz_fft_dipole(NULL)
+{
+  dipoleflag = 1;           // re-derived from atom->mu in init()
+  group_group_enable = 0;   // presumably turns off group/group K-space support - TODO confirm
+
+  cg_dipole = NULL;           // ghost-grid communicator, created in allocate()
+  cg_peratom_dipole = NULL;   // per-atom ghost-grid communicator, created in allocate_peratom()
+}
+
+/* ----------------------------------------------------------------------
+   free all memory
+------------------------------------------------------------------------- */
+
+PPPMDipole::~PPPMDipole()
+{
+  if (copymode) return;   // presumably a copy that does not own the arrays - TODO confirm
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+  fft1 = NULL;        // deallocate() has already deleted these four objects;
+  fft2 = NULL;        // clear the pointers so that code running afterwards
+  remap = NULL;       // (presumably the PPPM base destructor - TODO confirm)
+  cg_dipole = NULL;   // cannot delete them a second time
+}
+
+/* ----------------------------------------------------------------------
+   called once before run: validate settings, size the grid, allocate
+------------------------------------------------------------------------- */
+
+void PPPMDipole::init()
+{
+  if (me == 0) {
+    if (screen) fprintf(screen,"PPPMDipole initialization ...\n");
+    if (logfile) fprintf(logfile,"PPPMDipole initialization ...\n");
+  }
+
+  // error check
+
+  dipoleflag = atom->mu?1:0;
+  qsum_qsq(0);   // presumably refreshes q2, which is tested next - TODO confirm
+  if (dipoleflag && q2)
+    error->all(FLERR,"Cannot (yet) uses charges with Kspace style PPPMDipole");
+
+  triclinic_check();
+
+  if (triclinic != domain->triclinic)
+    error->all(FLERR,"Must redefine kspace_style after changing to triclinic box");
+
+  if (domain->dimension == 2) error->all(FLERR,
+                                         "Cannot use PPPMDipole with 2d simulation");
+  if (comm->style != 0)
+    error->universe_all(FLERR,"PPPMDipole can only currently be used with "
+                        "comm_style brick");
+
+  if (!atom->mu) error->all(FLERR,"Kspace style requires atom attribute mu");
+
+  if (atom->mu && differentiation_flag == 1) error->all(FLERR,"Cannot (yet) use kspace_modify diff"
+       " ad with dipoles");
+
+  if (dipoleflag && strcmp(update->unit_style,"electron") == 0)
+    error->all(FLERR,"Cannot (yet) use 'electron' units with dipoles");
+
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPMDipole");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab PPPMDipole");
+  }
+
+  if (order < 2 || order > MAXORDER) {
+    char str[128];
+    sprintf(str,"PPPMDipole order cannot be < 2 or > than %d",MAXORDER);
+    error->all(FLERR,str);
+  }
+
+  // reject triclinic boxes, then read the pair style's "cut_coul" cutoff
+
+  triclinic = domain->triclinic;
+  if (triclinic)
+    error->all(FLERR,"Cannot yet use triclinic cells with PPPMDipole");
+
+  pair_check();
+
+  int itmp = 0;
+  double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
+  if (p_cutoff == NULL)
+    error->all(FLERR,"KSpace style is incompatible with Pair style");
+  cutoff = *p_cutoff;
+
+  // kspace TIP4P not yet supported
+
+  if (tip4pflag)
+    error->all(FLERR,"Cannot yet use TIP4P with PPPMDipole");
+
+  // refresh the dipole sums via musum_musq() and record the atom count
+
+  scale = 1.0;
+  qqrd2e = force->qqrd2e;
+  musum_musq();
+  natoms_original = atom->natoms;
+
+  // set accuracy (force units) from accuracy_relative or accuracy_absolute
+
+  if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
+  else accuracy = accuracy_relative * two_charge_force;
+
+  // free all arrays previously allocated
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+
+  // setup FFT grid resolution and g_ewald
+  // normally one iteration thru while loop is all that is required
+  // if grid stencil does not extend beyond neighbor proc
+  //   or overlap is allowed, then done
+  // else reduce order and try again
+
+  int (*procneigh)[2] = comm->procneigh;
+
+  GridComm *cgtmp = NULL;
+  int iteration = 0;
+
+  while (order >= minorder) {
+    if (iteration && me == 0)
+      error->warning(FLERR,"Reducing PPPMDipole order b/c stencil extends "
+                     "beyond nearest neighbor processor");
+
+    compute_gf_denom();
+    set_grid_global();
+    set_grid_local();
+    if (overlap_allowed) break;
+
+    cgtmp = new GridComm(lmp,world,1,1,
+                         nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                         nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                         procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                         procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+    cgtmp->ghost_notify();
+    if (!cgtmp->ghost_overlap()) break;   // no overlap: keep this order
+    delete cgtmp;
+
+    order--;   // retry with a smaller stencil
+    iteration++;
+  }
+
+  if (order < minorder) error->all(FLERR,"PPPMDipole order < minimum allowed order");
+  if (!overlap_allowed && cgtmp->ghost_overlap())
+    error->all(FLERR,"PPPMDipole grid stencil extends "
+               "beyond nearest neighbor processor");
+  if (cgtmp) delete cgtmp;
+
+  // adjust g_ewald
+
+  if (!gewaldflag) adjust_gewald();
+
+  // calculate the final accuracy
+
+  double estimated_accuracy = final_accuracy_dipole();
+
+  // print stats
+
+  int ngrid_max,nfft_both_max;
+  MPI_Allreduce(&ngrid,&ngrid_max,1,MPI_INT,MPI_MAX,world);
+  MPI_Allreduce(&nfft_both,&nfft_both_max,1,MPI_INT,MPI_MAX,world);
+
+  if (me == 0) {
+
+#ifdef FFT_SINGLE
+    const char fft_prec[] = "single";
+#else
+    const char fft_prec[] = "double";
+#endif
+
+    if (screen) {
+      fprintf(screen,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(screen,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(screen,"  stencil order = %d\n",order);
+      fprintf(screen,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(screen,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(screen,"  using %s precision FFTs\n",fft_prec);
+      fprintf(screen,"  3d grid and FFT values/proc = %d %d\n",
+              ngrid_max,nfft_both_max);
+    }
+    if (logfile) {
+      fprintf(logfile,"  G vector (1/distance) = %g\n",g_ewald);
+      fprintf(logfile,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
+      fprintf(logfile,"  stencil order = %d\n",order);
+      fprintf(logfile,"  estimated absolute RMS force accuracy = %g\n",
+              estimated_accuracy);
+      fprintf(logfile,"  estimated relative force accuracy = %g\n",
+              estimated_accuracy/two_charge_force);
+      fprintf(logfile,"  using %s precision FFTs\n",fft_prec);
+      fprintf(logfile,"  3d grid and FFT values/proc = %d %d\n",
+              ngrid_max,nfft_both_max);
+    }
+  }
+
+  // allocate K-space dependent memory
+  // don't invoke allocate peratom(), will be allocated when needed
+
+  allocate();
+  cg_dipole->ghost_notify();
+  cg_dipole->setup();
+
+  // pre-compute Green's function denominator expansion
+  // pre-compute 1d charge distribution coefficients
+
+  compute_gf_denom();
+  compute_rho_coeff();
+}
+
+/* ----------------------------------------------------------------------
+   adjust PPPMDipole coeffs, called initially and whenever volume has changed
+------------------------------------------------------------------------- */
+
+void PPPMDipole::setup()
+{
+  // perform some checks to avoid illegal boundaries with read_data
+
+  if (slabflag == 0 && domain->nonperiodic > 0)
+    error->all(FLERR,"Cannot use nonperiodic boundaries with PPPMDipole");
+  if (slabflag) {
+    if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
+        domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
+      error->all(FLERR,"Incorrect boundaries with slab PPPMDipole");
+  }
+
+  int i,j,k,n;
+  double *prd;
+
+  // volume-dependent factors
+  // adjust z dimension for 2d slab PPPMDipole
+  // z dimension for 3d PPPMDipole is zprd since slab_volfactor = 1.0
+
+  prd = domain->prd;
+  double xprd = prd[0];
+  double yprd = prd[1];
+  double zprd = prd[2];
+  double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  delxinv = nx_pppm/xprd;   // grid points per unit length in x
+  delyinv = ny_pppm/yprd;
+  delzinv = nz_pppm/zprd_slab;
+
+  delvolinv = delxinv*delyinv*delzinv;
+
+  double unitkx = (MY_2PI/xprd);
+  double unitky = (MY_2PI/yprd);
+  double unitkz = (MY_2PI/zprd_slab);
+
+  // fkx,fky,fkz for my FFT grid pts
+  // 2*i/n is integer division, so per = i, or i-n once 2*i >= n
+  double per;
+
+  for (i = nxlo_fft; i <= nxhi_fft; i++) {
+    per = i - nx_pppm*(2*i/nx_pppm);
+    fkx[i] = unitkx*per;
+  }
+
+  for (i = nylo_fft; i <= nyhi_fft; i++) {
+    per = i - ny_pppm*(2*i/ny_pppm);
+    fky[i] = unitky*per;
+  }
+
+  for (i = nzlo_fft; i <= nzhi_fft; i++) {
+    per = i - nz_pppm*(2*i/nz_pppm);
+    fkz[i] = unitkz*per;
+  }
+
+  // virial coefficients
+  // vg[n][0..5] is filled in the order xx, yy, zz, xy, xz, yz
+  double sqk,vterm;
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++) {
+    for (j = nylo_fft; j <= nyhi_fft; j++) {
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        sqk = fkx[i]*fkx[i] + fky[j]*fky[j] + fkz[k]*fkz[k];
+        if (sqk == 0.0) {   // k = 0: all six coefficients are zero
+          vg[n][0] = 0.0;
+          vg[n][1] = 0.0;
+          vg[n][2] = 0.0;
+          vg[n][3] = 0.0;
+          vg[n][4] = 0.0;
+          vg[n][5] = 0.0;
+        } else {
+          vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
+          vg[n][0] = 1.0 + vterm*fkx[i]*fkx[i];
+          vg[n][1] = 1.0 + vterm*fky[j]*fky[j];
+          vg[n][2] = 1.0 + vterm*fkz[k]*fkz[k];
+          vg[n][3] = vterm*fkx[i]*fky[j];
+          vg[n][4] = vterm*fkx[i]*fkz[k];
+          vg[n][5] = vterm*fky[j]*fkz[k];
+        }
+        n++;
+      }
+    }
+  }
+
+  compute_gf_dipole();   // presumably rebuilds greensfn for the new box - TODO confirm
+}
+
+/* ----------------------------------------------------------------------
+   reset local grid arrays and communication stencils
+   called by fix balance b/c it changed sizes of processor sub-domains
+------------------------------------------------------------------------- */
+
+void PPPMDipole::setup_grid()
+{
+  // free all arrays previously allocated
+
+  deallocate();
+  if (peratom_allocate_flag) deallocate_peratom();
+
+  // reset portion of global grid that each proc owns
+
+  set_grid_local();
+
+  // reallocate K-space dependent memory
+  // check if grid communication is now overlapping if not allowed
+  // don't invoke allocate peratom(), will be allocated when needed
+
+  allocate();
+
+  cg_dipole->ghost_notify();
+  if (overlap_allowed == 0 && cg_dipole->ghost_overlap())
+    error->all(FLERR,"PPPMDipole grid stencil extends "
+               "beyond nearest neighbor processor");
+  cg_dipole->setup();
+
+  // pre-compute Green's function denominator expansion
+  // pre-compute 1d charge distribution coefficients
+
+  compute_gf_denom();
+  compute_rho_coeff();
+
+  // pre-compute volume-dependent coeffs
+
+  setup();   // also re-runs the boundary checks at the top of setup()
+}
+
+/* ----------------------------------------------------------------------
+   compute the PPPMDipole long-range force, energy, virial
+------------------------------------------------------------------------- */
+
+void PPPMDipole::compute(int eflag, int vflag)
+{
+  int i,j;
+
+  // set energy/virial flags
+  // invoke allocate_peratom() if needed for first time
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = evflag_atom = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  if (evflag_atom && !peratom_allocate_flag) {
+    allocate_peratom();
+    cg_peratom_dipole->ghost_notify();
+    cg_peratom_dipole->setup();
+  }
+
+  // if atom count has changed, refresh the dipole sums via musum_musq()
+
+  if (atom->natoms != natoms_original) {
+    musum_musq();
+    natoms_original = atom->natoms;
+  }
+
+  // return if there are no dipoles
+
+  if (musqsum == 0.0) return;
+
+  // cache the box origin; no lamda conversion since init() rejects triclinic
+
+  boxlo = domain->boxlo;
+
+  // extend size of per-atom arrays if necessary
+
+  if (atom->nmax > nmax) {
+    memory->destroy(part2grid);
+    nmax = atom->nmax;
+    memory->create(part2grid,nmax,3,"pppm_dipole:part2grid");
+  }
+
+  // find grid points for all my particles
+  // map my particle dipoles onto my local 3d density grids
+
+  particle_map();
+  make_rho_dipole();
+
+  // all procs communicate density values from their ghost cells
+  //   to fully sum contribution in their 3d bricks
+  // remap from 3d decomposition to FFT decomposition
+
+  cg_dipole->reverse_comm(this,REVERSE_MU);
+  brick2fft_dipole();
+
+  // compute potential gradient on my FFT grid and
+  //   portion of e_long on this proc's FFT grid
+  // return gradients (electric fields) in 3d brick decomposition
+  // also performs per-atom calculations via poisson_peratom()
+
+  poisson_ik_dipole();
+
+  // all procs communicate E-field values
+  // to fill ghost cells surrounding their 3d bricks
+
+  cg_dipole->forward_comm(this,FORWARD_MU);
+
+  // extra per-atom energy/virial communication
+
+  if (evflag_atom) {
+    cg_peratom_dipole->forward_comm(this,FORWARD_MU_PERATOM);
+  }
+
+  // calculate the force on my particles
+
+  fieldforce_ik_dipole();
+
+  // per-atom energy/virial terms, only when per-atom tallies are requested
+
+  if (evflag_atom) fieldforce_peratom_dipole();
+
+  // sum global energy across procs and add in volume-dependent term
+
+  const double qscale = qqrd2e * scale;
+  const double g3 = g_ewald*g_ewald*g_ewald;
+
+  if (eflag_global) {
+    double energy_all;
+    MPI_Allreduce(&energy,&energy_all,1,MPI_DOUBLE,MPI_SUM,world);
+    energy = energy_all;
+
+    energy *= 0.5*volume;
+    energy -= musqsum*2.0*g3/3.0/MY_PIS;
+    energy *= qscale;
+  }
+
+  // sum global virial across procs
+
+  if (vflag_global) {
+    double virial_all[6];
+    MPI_Allreduce(virial,virial_all,6,MPI_DOUBLE,MPI_SUM,world);
+    for (i = 0; i < 6; i++) virial[i] = 0.5*qscale*volume*virial_all[i];
+  }
+
+  // per-atom energy/virial
+  // energy includes self-energy correction
+
+  if (evflag_atom) {
+    double **mu = atom->mu;
+    int nlocal = atom->nlocal;
+    int ntotal = nlocal;
+
+    if (eflag_atom) {
+      for (i = 0; i < nlocal; i++) {
+        eatom[i] *= 0.5;
+        eatom[i] -= (mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] +
+                     mu[i][2]*mu[i][2])*2.0*g3/3.0/MY_PIS;
+        eatom[i] *= qscale;
+      }
+    }
+
+    if (vflag_atom) {
+      for (i = 0; i < ntotal; i++)
+        for (j = 0; j < 6; j++) vatom[i][j] *= 0.5*qscale;
+    }
+  }
+
+  // 2d slab correction
+
+  if (slabflag == 1) slabcorr();
+}
+
+/* ----------------------------------------------------------------------
+   allocate memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMDipole::allocate()
+{
+  memory->create3d_offset(densityx_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:densityx_brick_dipole");
+  memory->create3d_offset(densityy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:densityy_brick_dipole");
+  memory->create3d_offset(densityz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:densityz_brick_dipole");
+
+  memory->create(densityx_fft_dipole,nfft_both,"pppm_dipole:densityx_fft_dipole");
+  memory->create(densityy_fft_dipole,nfft_both,"pppm_dipole:densityy_fft_dipole");
+  memory->create(densityz_fft_dipole,nfft_both,"pppm_dipole:densityz_fft_dipole");
+
+  memory->create(greensfn,nfft_both,"pppm_dipole:greensfn");
+  memory->create(work1,2*nfft_both,"pppm_dipole:work1");
+  memory->create(work2,2*nfft_both,"pppm_dipole:work2");
+  memory->create(work3,2*nfft_both,"pppm_dipole:work3");
+  memory->create(work4,2*nfft_both,"pppm_dipole:work4");
+  memory->create(vg,nfft_both,6,"pppm_dipole:vg");
+
+  memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm_dipole:fkx");
+  memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm_dipole:fky");
+  memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm_dipole:fkz");
+
+  memory->create3d_offset(ux_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:ux_brick_dipole");
+  memory->create3d_offset(uy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:uy_brick_dipole");
+  memory->create3d_offset(uz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:uz_brick_dipole");
+
+  memory->create3d_offset(vdxx_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdxx_brick_dipole");
+  memory->create3d_offset(vdxy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdxy_brick_dipole");
+  memory->create3d_offset(vdyy_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdyy_brick_dipole");
+  memory->create3d_offset(vdxz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdxz_brick_dipole");
+  memory->create3d_offset(vdyz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdyz_brick_dipole");
+  memory->create3d_offset(vdzz_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:vdzz_brick_dipole");
+
+  // summation coeffs
+  // remember the order used, so deallocate() can undo the same offsets
+  order_allocated = order;
+  memory->create(gf_b,order,"pppm_dipole:gf_b");
+  memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm_dipole:rho1d");
+  memory->create2d_offset(drho1d,3,-order/2,order/2,"pppm_dipole:drho1d");
+  memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm_dipole:rho_coeff");
+  memory->create2d_offset(drho_coeff,order,(1-order)/2,order/2,
+                          "pppm_dipole:drho_coeff");
+
+  // create 2 FFTs and a Remap
+  // 1st FFT keeps data in FFT decomposition
+  // 2nd FFT returns data in 3d brick decomposition
+  // remap takes data from 3d brick to FFT decomposition
+
+  int tmp;   // receives an output value from the FFT3d constructors; not used here
+
+  fft1 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
+                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                   0,0,&tmp,collective_flag);
+
+  fft2 = new FFT3d(lmp,world,nx_pppm,ny_pppm,nz_pppm,
+                   nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                   nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                   0,0,&tmp,collective_flag);
+
+  remap = new Remap(lmp,world,
+                    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
+                    1,0,0,FFT_PRECISION,collective_flag);
+
+  // create ghost grid object for rho and electric field communication
+  // NOTE(review): 9 and 3 look like forward/reverse values per grid point - confirm
+  int (*procneigh)[2] = comm->procneigh;
+
+  cg_dipole = new GridComm(lmp,world,9,3,
+                           nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                           nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                           procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                           procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+}
+
+/* ----------------------------------------------------------------------
+   deallocate memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMDipole::deallocate()
+{
+  memory->destroy3d_offset(densityx_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(densityy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(densityz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  memory->destroy3d_offset(ux_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(uy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(uz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  memory->destroy3d_offset(vdxx_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdxy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdyy_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdxz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdyz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(vdzz_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  memory->destroy(densityx_fft_dipole);
+  memory->destroy(densityy_fft_dipole);
+  memory->destroy(densityz_fft_dipole);
+
+  memory->destroy(greensfn);
+  memory->destroy(work1);
+  memory->destroy(work2);
+  memory->destroy(work3);
+  memory->destroy(work4);
+  memory->destroy(vg);
+
+  memory->destroy1d_offset(fkx,nxlo_fft);
+  memory->destroy1d_offset(fky,nylo_fft);
+  memory->destroy1d_offset(fkz,nzlo_fft);
+
+  // offsets use order_allocated (set in allocate()), not the current order
+  memory->destroy(gf_b);
+  memory->destroy2d_offset(rho1d,-order_allocated/2);
+  memory->destroy2d_offset(drho1d,-order_allocated/2);
+  memory->destroy2d_offset(rho_coeff,(1-order_allocated)/2);
+  memory->destroy2d_offset(drho_coeff,(1-order_allocated)/2);
+  // null each pointer after delete so a repeated deallocate() cannot double-free
+  delete fft1; fft1 = NULL;
+  delete fft2; fft2 = NULL;
+  delete remap; remap = NULL;
+  delete cg_dipole; cg_dipole = NULL;
+}
+
+/* ----------------------------------------------------------------------
+   allocate per-atom memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMDipole::allocate_peratom()
+{
+  peratom_allocate_flag = 1;   // tested by compute(), init(), setup_grid(), destructor
+
+  memory->create3d_offset(v0x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v0x_brick_dipole");
+  memory->create3d_offset(v1x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v1x_brick_dipole");
+  memory->create3d_offset(v2x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v2x_brick_dipole");
+  memory->create3d_offset(v3x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v3x_brick_dipole");
+  memory->create3d_offset(v4x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v4x_brick_dipole");
+  memory->create3d_offset(v5x_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v5x_brick_dipole");
+
+  memory->create3d_offset(v0y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v0y_brick_dipole");
+  memory->create3d_offset(v1y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v1y_brick_dipole");
+  memory->create3d_offset(v2y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v2y_brick_dipole");
+  memory->create3d_offset(v3y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v3y_brick_dipole");
+  memory->create3d_offset(v4y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v4y_brick_dipole");
+  memory->create3d_offset(v5y_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v5y_brick_dipole");
+
+  memory->create3d_offset(v0z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v0z_brick_dipole");
+  memory->create3d_offset(v1z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v1z_brick_dipole");
+  memory->create3d_offset(v2z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v2z_brick_dipole");
+  memory->create3d_offset(v3z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v3z_brick_dipole");
+  memory->create3d_offset(v4z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v4z_brick_dipole");
+  memory->create3d_offset(v5z_brick_dipole,nzlo_out,nzhi_out,nylo_out,nyhi_out,
+                          nxlo_out,nxhi_out,"pppm_dipole:v5z_brick_dipole");
+
+  // create ghost grid object for per-atom energy/virial grid communication
+  // NOTE(review): 18 presumably matches the 18 v*_brick_dipole arrays above - confirm
+  int (*procneigh)[2] = comm->procneigh;
+
+  cg_peratom_dipole =
+    new GridComm(lmp,world,18,1,
+                 nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
+                 nxlo_out,nxhi_out,nylo_out,nyhi_out,nzlo_out,nzhi_out,
+                 procneigh[0][0],procneigh[0][1],procneigh[1][0],
+                 procneigh[1][1],procneigh[2][0],procneigh[2][1]);
+}
+
+/* ----------------------------------------------------------------------
+   deallocate per-atom memory that depends on # of K-vectors and order
+------------------------------------------------------------------------- */
+
+void PPPMDipole::deallocate_peratom()
+{
+  peratom_allocate_flag = 0;   // callers test this flag before calling again
+
+  memory->destroy3d_offset(v0x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v1x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v2x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v3x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v4x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v5x_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  memory->destroy3d_offset(v0y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v1y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v2y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v3y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v4y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v5y_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  memory->destroy3d_offset(v0z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v1z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v2z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v3z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v4z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+  memory->destroy3d_offset(v5z_brick_dipole,nzlo_out,nylo_out,nxlo_out);
+
+  delete cg_peratom_dipole;   // created in allocate_peratom(); pointer is not reset here
+}
+
+/* ----------------------------------------------------------------------
+   choose the global PPPMDipole grid size nx_pppm,ny_pppm,nz_pppm
+   also estimates g_ewald unless gewaldflag is set
+   grid is used for density accumulation, FFTs, and field interpolation
+------------------------------------------------------------------------- */
+
+void PPPMDipole::set_grid_global()
+{
+  // box lengths, with z stretched by slab_volfactor for 2d slab runs
+  // (3d PPPMDipole has slab_volfactor = 1.0, so zprd_slab = zprd)
+
+  const double xprd = domain->xprd;
+  const double yprd = domain->yprd;
+  const double zprd = domain->zprd;
+  const double zprd_slab = zprd*slab_volfactor;
+
+  // first guess for g_ewald from the accuracy and real-space cutoff,
+  // then refined by the Newton solver
+  // the solver gets the volume built from zprd, not zprd_slab
+
+  if (!gewaldflag) {
+    if (accuracy <= 0.0)
+      error->all(FLERR,"KSpace accuracy must be > 0");
+    if (mu2 == 0.0)
+      error->all(FLERR,"Must use kspace_modify gewald for systems with no dipoles");
+    g_ewald = (1.35 - 0.15*log(accuracy))/cutoff;
+
+    const bigint natoms = atom->natoms;
+    const double refined =
+      find_gewald_dipole(g_ewald,cutoff,natoms,xprd*yprd*zprd,mu2);
+    if (refined > 0.0) g_ewald = refined;
+    else error->warning(FLERR,"PPPMDipole dipole Newton solver failed, "
+                        "using old method to estimate g_ewald");
+  }
+
+  // when gridflag is not set, search for the grid size:
+  // start at spacing 4/g_ewald and shrink it by 5% per pass until
+  // the estimated kspace error no longer exceeds the accuracy
+
+  if (!gridflag) {
+    double spacing = 4.0/g_ewald;
+    h_x = h_y = h_z = spacing;
+
+    for (int pass = 1; ; pass++) {
+      // grid points per dimension, never fewer than 2
+      // nz_pppm uses the extended zprd_slab instead of zprd
+
+      nx_pppm = static_cast<int> (xprd/h_x);
+      ny_pppm = static_cast<int> (yprd/h_y);
+      nz_pppm = static_cast<int> (zprd_slab/h_z);
+
+      if (nx_pppm < 2) nx_pppm = 2;
+      if (ny_pppm < 2) ny_pppm = 2;
+      if (nz_pppm < 2) nz_pppm = 2;
+
+      // processor layout of the FFT grid in y,z:
+      // all procs along z when nz_pppm >= nprocs,
+      // otherwise procs2grid2d() picks npey_fft,npez_fft
+
+      int npey_fft,npez_fft;
+      if (nz_pppm < nprocs)
+        procs2grid2d(nprocs,ny_pppm,nz_pppm,&npey_fft,&npez_fft);
+      else {
+        npey_fft = 1;
+        npez_fft = nprocs;
+      }
+
+      const int me_y = me % npey_fft;
+      const int me_z = me / npey_fft;
+
+      // my index range of the FFT grid; read by the error estimate
+
+      nxlo_fft = 0;
+      nxhi_fft = nx_pppm - 1;
+      nylo_fft = me_y*ny_pppm/npey_fft;
+      nyhi_fft = (me_y+1)*ny_pppm/npey_fft - 1;
+      nzlo_fft = me_z*nz_pppm/npez_fft;
+      nzhi_fft = (me_z+1)*nz_pppm/npez_fft - 1;
+
+      // done once the accuracy is reached; give up after 500 passes
+      if (compute_df_kspace_dipole() <= accuracy) break;
+      if (pass > 500) error->all(FLERR, "Could not compute grid size");
+      spacing *= 0.95;
+      h_x = h_y = h_z = spacing;
+    }
+  }
+
+  // boost grid size until it is factorable
+
+  while (!factorable(nx_pppm)) ++nx_pppm;
+  while (!factorable(ny_pppm)) ++ny_pppm;
+  while (!factorable(nz_pppm)) ++nz_pppm;
+
+  // grid spacings that match the final dimensions
+  h_x = xprd/nx_pppm;
+  h_y = yprd/ny_pppm;
+  h_z = zprd_slab/nz_pppm;
+
+  if (!(nx_pppm < OFFSET && ny_pppm < OFFSET && nz_pppm < OFFSET))
+    error->all(FLERR,"PPPMDipole grid is too large");
+}
+
+
+/* ----------------------------------------------------------------------
+   compute estimated kspace force error for dipoles
+   = sqrt(qopt/N) * mu2 / (3 * xprd*yprd*zprd_slab)
+   N = atom->natoms, taken as 1 for an empty system (no divide by zero)
+------------------------------------------------------------------------- */
+
+double PPPMDipole::compute_df_kspace_dipole()
+{
+  const double xprd = domain->xprd;
+  const double yprd = domain->yprd;
+  const double zprd_slab = domain->zprd*slab_volfactor;
+  bigint natoms = atom->natoms;
+  if (natoms == 0) natoms = 1; // same guard as final_accuracy_dipole()
+  return sqrt(compute_qopt_dipole()/natoms)*mu2/(3.0*xprd*yprd*zprd_slab);
+}
+
+/* ----------------------------------------------------------------------
+   compute qopt for dipoles with ik differentiation
+   each proc sums over its own FFT grid points, with 2 alias images on
+   either side per dimension; the return value is summed over all procs
+------------------------------------------------------------------------- */
+
+double PPPMDipole::compute_qopt_dipole()
+{
+  const double * const prd = domain->prd;
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);
+
+  // alias images summed on either side of each wave vector
+
+  const int nbx = 2;
+  const int nby = 2;
+  const int nbz = 2;
+
+  const int twoorder = 2*order;
+
+  double qopt = 0.0;
+
+  // mper,lper,kper: grid index folded into a signed wave index
+  // (for 0 <= index < n, the upper half maps to negative values)
+
+  for (int m = nzlo_fft; m <= nzhi_fft; m++) {
+    const int mper = m - nz_pppm*(2*m/nz_pppm);
+    double snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
+
+    for (int l = nylo_fft; l <= nyhi_fft; l++) {
+      const int lper = l - ny_pppm*(2*l/ny_pppm);
+      double sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
+
+      for (int k = nxlo_fft; k <= nxhi_fft; k++) {
+        const int kper = k - nx_pppm*(2*k/nx_pppm);
+        double snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
+
+        const double sqk =
+          square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
+
+        // the zero wave vector does not contribute
+        if (sqk == 0.0) continue;
+
+        const double denominator = gf_denom(snx,sny,snz);
+        double sum1 = 0.0;
+        double sum2 = 0.0;
+
+        for (int nx = -nbx; nx <= nbx; nx++) {
+          const double qx = unitkx*(kper+nx_pppm*nx);
+          const double sx = exp(-0.25*square(qx/g_ewald));
+          double argx = 0.5*qx*xprd/nx_pppm;
+          const double wx = powsinxx(argx,twoorder);
+
+          for (int ny = -nby; ny <= nby; ny++) {
+            const double qy = unitky*(lper+ny_pppm*ny);
+            const double sy = exp(-0.25*square(qy/g_ewald));
+            double argy = 0.5*qy*yprd/ny_pppm;
+            const double wy = powsinxx(argy,twoorder);
+
+            for (int nz = -nbz; nz <= nbz; nz++) {
+              const double qz = unitkz*(mper+nz_pppm*nz);
+              const double sz = exp(-0.25*square(qz/g_ewald));
+              double argz = 0.5*qz*zprd_slab/nz_pppm;
+              const double wz = powsinxx(argz,twoorder);
+
+              // dot1 enters cubed: power of 3 for dipole forces
+              const double dot1 =
+                unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
+              const double dot2 = qx*qx + qy*qy + qz*qz;
+              const double u1 = sx*sy*sz;
+              const double w2 = wx*wy*wz;
+              const double phi = u1*MY_4PI/dot2;
+              const double top = dot1*dot1*dot1*w2*phi;
+              sum1 += phi*phi*dot2*dot2*dot2;
+              sum2 += top*top/sqk/sqk/sqk;
+            }
+          }
+        }
+        qopt += sum1 - sum2/denominator;
+      }
+    }
+  }
+
+  // combine the partial sums of all procs
+
+  double qopt_all;
+  MPI_Allreduce(&qopt,&qopt_all,1,MPI_DOUBLE,MPI_SUM,world);
+  return qopt_all;
+}
+
+/* ----------------------------------------------------------------------
+   pre-compute modified (Hockney-Eastwood) Green's function for dipoles
+   fills greensfn for my FFT grid points in z,y,x loop order
+   the entry of the zero wave vector is set to 0.0
+------------------------------------------------------------------------- */
+
+void PPPMDipole::compute_gf_dipole()
+{
+  const double * const prd = domain->prd;
+
+  const double xprd = prd[0];
+  const double yprd = prd[1];
+  const double zprd = prd[2];
+  const double zprd_slab = zprd*slab_volfactor;
+  const double unitkx = (MY_2PI/xprd);
+  const double unitky = (MY_2PI/yprd);
+  const double unitkz = (MY_2PI/zprd_slab);
+
+  // alias images summed on either side per dimension, at least 2
+  // the factor (-log(EPS_HOC))^(1/4) is common to all three
+
+  const double hoc = pow(-log(EPS_HOC),0.25);
+  int nbx = static_cast<int> ((g_ewald*xprd/(MY_PI*nx_pppm)) * hoc);
+  int nby = static_cast<int> ((g_ewald*yprd/(MY_PI*ny_pppm)) * hoc);
+  int nbz = static_cast<int> ((g_ewald*zprd_slab/(MY_PI*nz_pppm)) * hoc);
+  nbx = MAX(nbx,2);
+  nby = MAX(nby,2);
+  nbz = MAX(nbz,2);
+  const int twoorder = 2*order;
+
+  // mper,lper,kper: grid index folded into a signed wave index
+
+  int n = 0;
+  for (int m = nzlo_fft; m <= nzhi_fft; m++) {
+    const int mper = m - nz_pppm*(2*m/nz_pppm);
+    double snz = square(sin(0.5*unitkz*mper*zprd_slab/nz_pppm));
+
+    for (int l = nylo_fft; l <= nyhi_fft; l++) {
+      const int lper = l - ny_pppm*(2*l/ny_pppm);
+      double sny = square(sin(0.5*unitky*lper*yprd/ny_pppm));
+
+      for (int k = nxlo_fft; k <= nxhi_fft; k++) {
+        const int kper = k - nx_pppm*(2*k/nx_pppm);
+        double snx = square(sin(0.5*unitkx*kper*xprd/nx_pppm));
+
+        const double sqk =
+          square(unitkx*kper) + square(unitky*lper) + square(unitkz*mper);
+
+        // zero wave vector: store 0.0 and move on
+        if (sqk == 0.0) {
+          greensfn[n++] = 0.0;
+          continue;
+        }
+
+        const double denominator = gf_denom(snx,sny,snz);
+        double sum1 = 0.0;
+
+        for (int nx = -nbx; nx <= nbx; nx++) {
+          const double qx = unitkx*(kper+nx_pppm*nx);
+          const double sx = exp(-0.25*square(qx/g_ewald));
+          double argx = 0.5*qx*xprd/nx_pppm;
+          const double wx = powsinxx(argx,twoorder);
+
+          for (int ny = -nby; ny <= nby; ny++) {
+            const double qy = unitky*(lper+ny_pppm*ny);
+            const double sy = exp(-0.25*square(qy/g_ewald));
+            double argy = 0.5*qy*yprd/ny_pppm;
+            const double wy = powsinxx(argy,twoorder);
+
+            for (int nz = -nbz; nz <= nbz; nz++) {
+              const double qz = unitkz*(mper+nz_pppm*nz);
+              const double sz = exp(-0.25*square(qz/g_ewald));
+              double argz = 0.5*qz*zprd_slab/nz_pppm;
+              const double wz = powsinxx(argz,twoorder);
+
+              // phi = 4*pi*exp(-q^2/(4*g_ewald^2))/q^2 for the aliased vector q
+              // dot1 enters cubed and is scaled by sqk^3
+              const double dot1 =
+                unitkx*kper*qx + unitky*lper*qy + unitkz*mper*qz;
+              const double dot2 = qx*qx + qy*qy + qz*qz;
+              const double u1 = sx*sy*sz;
+              const double w2 = wx*wy*wz;
+              const double phi = u1*MY_4PI/dot2;
+              sum1 += dot1*dot1*dot1*w2*phi/sqk/sqk/sqk;
+            }
+          }
+        }
+        greensfn[n++] = sum1/denominator;
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   calculate f(x) for use in Newton-Raphson solver
+   = estimated real-space error - estimated kspace error at g_ewald
+------------------------------------------------------------------------- */
+
+double PPPMDipole::newton_raphson_f()
+{
+  const bigint natoms = atom->natoms;
+  const double vol = domain->xprd*domain->yprd*domain->zprd;
+
+  // even powers of cutoff*g_ewald and the two polynomials built on them
+
+  const double rg = cutoff*g_ewald;
+  const double rg2 = rg*rg;
+  const double rg4 = rg2*rg2;
+  const double rg6 = rg4*rg2;
+  const double c_poly = 4.0*rg4 + 6.0*rg2 + 3.0;
+  const double d_poly = 8.0*rg6 + 20.0*rg4 + 30.0*rg2 + 15.0;
+
+  const double df_rspace =
+    mu2/(sqrt(vol*powint(g_ewald,4)*powint(cutoff,9)*natoms)) *
+    sqrt(13.0/6.0*c_poly*c_poly + 2.0/15.0*d_poly*d_poly -
+         13.0/15.0*c_poly*d_poly) * exp(-rg2);
+  return df_rspace - compute_df_kspace_dipole();
+}
+
+/* ----------------------------------------------------------------------
+   Newton-Raphson search for the dipolar g_ewald, started from x
+   returns the iterate once a step is smaller than the tolerance
+   returns -1 if x turns negative or NaN, or after 10000 iterations
+------------------------------------------------------------------------- */
+
+double PPPMDipole::find_gewald_dipole(double x, double Rc,
+                              bigint natoms, double vol, double b2)
+{
+  const int max_iterations = 10000;
+  const double tolerance = 0.00001;
+
+  // Newton update: x <- x - f(x)/f'(x)
+  for (int iter = 0; iter < max_iterations; iter++) {
+    const double fval = newton_raphson_f_dipole(x,Rc,natoms,vol,b2);
+    const double slope = derivf_dipole(x,Rc,natoms,vol,b2);
+    const double step = fval/slope;
+
+    x -= step;
+    if (fabs(step) < tolerance) return x;
+    if (x < 0 || x != x) break;   // negative or NaN: solver failed
+  }
+
+  return -1;
+}
+
+/* ----------------------------------------------------------------------
+   objective function f(x) for the dipolar g_ewald search:
+   closed-form error expression evaluated at x, minus the accuracy
+ ------------------------------------------------------------------------- */
+
+double PPPMDipole::newton_raphson_f_dipole(double x, double Rc, bigint
+natoms, double vol, double b2)
+{
+  const double xr = Rc*x;
+  const double xr2 = xr*xr;
+  const double xr4 = xr2*xr2;
+  const double xr6 = xr4*xr2;
+  const double cc = 4.0*xr4 + 6.0*xr2 + 3.0;
+  const double dc = 8.0*xr6 + 20.0*xr4 + 30.0*xr2 + 15.0;
+  const double scale = b2/(sqrt(vol*powint(x,4)*powint(Rc,9)*natoms));
+  const double poly =
+    sqrt(13.0/6.0*cc*cc + 2.0/15.0*dc*dc - 13.0/15.0*cc*dc);
+  return scale*poly*exp(-xr2) - accuracy;
+}
+
+/* ----------------------------------------------------------------------
+   forward-difference estimate of f'(x) of the dipolar objective function
+ ------------------------------------------------------------------------- */
+
+double PPPMDipole::derivf_dipole(double x, double Rc, bigint natoms, double vol, double b2)
+{
+  const double step = 0.000001;  // finite-difference increment
+  const double f0 = newton_raphson_f_dipole(x,Rc,natoms,vol,b2);
+  return (newton_raphson_f_dipole(x + step,Rc,natoms,vol,b2) - f0) / step;
+}
+
+/* ----------------------------------------------------------------------
+   final estimate of the accuracy: kspace and real-space error
+   estimates combined as sqrt(df_kspace^2 + df_rspace^2)
+------------------------------------------------------------------------- */
+
+double PPPMDipole::final_accuracy_dipole()
+{
+  const double vol = domain->xprd*domain->yprd*domain->zprd;
+
+  // avoid division by zero for a system without atoms
+  const bigint natoms = (atom->natoms == 0) ? 1 : atom->natoms;
+
+  const double df_kspace = compute_df_kspace_dipole();
+
+  // real-space part: polynomials in even powers of cutoff*g_ewald
+  const double rg = cutoff*g_ewald;
+  const double rg2 = rg*rg;
+  const double rg4 = rg2*rg2;
+  const double rg6 = rg4*rg2;
+  const double c_poly = 4.0*rg4 + 6.0*rg2 + 3.0;
+  const double d_poly = 8.0*rg6 + 20.0*rg4 + 30.0*rg2 + 15.0;
+
+  const double df_rspace =
+    mu2/(sqrt(vol*powint(g_ewald,4)*powint(cutoff,9)*natoms)) *
+    sqrt(13.0/6.0*c_poly*c_poly + 2.0/15.0*d_poly*d_poly -
+         13.0/15.0*c_poly*d_poly) * exp(-rg2);
+
+  return sqrt(df_kspace*df_kspace + df_rspace*df_rspace);
+}
+
+/* ----------------------------------------------------------------------
+   pre-compute Green's function denominator expansion coeffs, Gamma(2n)
+   gf_b is re-allocated with 'order' entries on every call
+------------------------------------------------------------------------- */
+
+void PPPMDipole::compute_gf_denom()
+{
+  if (gf_b) memory->destroy(gf_b);
+  memory->create(gf_b,order,"pppm_dipole:gf_b");
+
+  // start from (1,0,...,0), then apply the recurrence for m = 1..order-1
+  gf_b[0] = 1.0;
+  for (int i = 1; i < order; i++) gf_b[i] = 0.0;
+  for (int m = 1; m < order; m++) {
+    for (int l = m; l > 0; l--)
+      gf_b[l] = 4.0 * (gf_b[l]*(l-m)*(l-m-0.5)-gf_b[l-1]*(l-m-1)*(l-m-1));
+    gf_b[0] = 4.0 * (gf_b[0]*(-m)*(-m-0.5));  // l = 0: no gf_b[l-1] term
+  }
+
+  // divide all coefficients by (2*order-1)!
+  bigint factorial = 1;
+  for (int k = 2; k < 2*order; k++) factorial *= k;
+  const double gaminv = 1.0/factorial;
+  for (int i = 0; i < order; i++) gf_b[i] *= gaminv;
+}
+
+/* ----------------------------------------------------------------------
+   spread the dipole moments of my atoms onto my section of the grid
+   densityx/y/z_brick_dipole(z,y,x) = x,y,z dipole "density" at the
+   grid points of my 3d brick, which are zeroed first
+   (nxlo:nxhi,nylo:nyhi,nzlo:nzhi) is extent of my brick (including ghosts)
+   in global grid
+------------------------------------------------------------------------- */
+
+void PPPMDipole::make_rho_dipole()
+{
+  // clear the three 3d density arrays, ngrid values each
+
+  memset(&(densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+  memset(&(densityy_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+  memset(&(densityz_brick_dipole[nzlo_out][nylo_out][nxlo_out]),0,
+         ngrid*sizeof(FFT_SCALAR));
+
+  double **mu = atom->mu;
+  double **x = atom->x;
+  const int nlocal = atom->nlocal;
+
+  // loop over my dipoles, add their contribution to nearby grid points
+  // (nx,ny,nz) = global coords of grid pt to "lower left" of dipole
+  // (dx,dy,dz) = distance to "lower left" grid pt
+  // (mx,my,mz) = global coords of moving stencil pt
+
+  for (int i = 0; i < nlocal; i++) {
+    const int nx = part2grid[i][0];
+    const int ny = part2grid[i][1];
+    const int nz = part2grid[i][2];
+
+    FFT_SCALAR dx = nx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    FFT_SCALAR dy = ny+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    FFT_SCALAR dz = nz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(dx,dy,dz);
+
+    // dipole vector scaled by delvolinv, then weighted in z, y and x
+
+    const FFT_SCALAR mux = delvolinv * mu[i][0];
+    const FFT_SCALAR muy = delvolinv * mu[i][1];
+    const FFT_SCALAR muz = delvolinv * mu[i][2];
+
+    for (int kz = nlower; kz <= nupper; kz++) {
+      const int mz = kz+nz;
+      const FFT_SCALAR zx = mux*rho1d[2][kz];
+      const FFT_SCALAR zy = muy*rho1d[2][kz];
+      const FFT_SCALAR zz = muz*rho1d[2][kz];
+
+      for (int ky = nlower; ky <= nupper; ky++) {
+        const int my = ky+ny;
+        const FFT_SCALAR yx = zx*rho1d[1][ky];
+        const FFT_SCALAR yy = zy*rho1d[1][ky];
+        const FFT_SCALAR yz = zz*rho1d[1][ky];
+
+        for (int kx = nlower; kx <= nupper; kx++) {
+          const int mx = kx+nx;
+          densityx_brick_dipole[mz][my][mx] += yx*rho1d[0][kx];
+          densityy_brick_dipole[mz][my][mx] += yy*rho1d[0][kx];
+          densityz_brick_dipole[mz][my][mx] += yz*rho1d[0][kx];
+        }
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   remap dipole density from 3d brick decomposition to FFT decomposition
+   done separately for the x, y and z density components
+------------------------------------------------------------------------- */
+
+void PPPMDipole::brick2fft_dipole()
+{
+  // pack the inner portion of each 3d brick into its flat fft array
+  // remap could be done as pre-stage of FFT,
+  //   but this works optimally on only double values, not complex values
+  int nflat = 0;
+  for (int iz = nzlo_in; iz <= nzhi_in; iz++) {
+    for (int iy = nylo_in; iy <= nyhi_in; iy++) {
+      for (int ix = nxlo_in; ix <= nxhi_in; ix++, nflat++) {
+        densityx_fft_dipole[nflat] = densityx_brick_dipole[iz][iy][ix];
+        densityy_fft_dipole[nflat] = densityy_brick_dipole[iz][iy][ix];
+        densityz_fft_dipole[nflat] = densityz_brick_dipole[iz][iy][ix];
+      }
+    }
+  }
+
+  // each component is remapped with the same array as input and output
+  remap->perform(densityx_fft_dipole,densityx_fft_dipole,work1);
+  remap->perform(densityy_fft_dipole,densityy_fft_dipole,work1);
+  remap->perform(densityz_fft_dipole,densityz_fft_dipole,work1);
+}
+
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for ik
+   fills the u*_brick_dipole and vd*_brick_dipole grids from the densities
+------------------------------------------------------------------------- */
+
+void PPPMDipole::poisson_ik_dipole()
+{
+  int i,j,k,n,ii;
+  double eng,wreal,wimg;
+
+  // transform dipole density (r -> k)
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work1[n] = densityx_fft_dipole[i];
+    work1[n+1] = ZEROF;
+    work2[n] = densityy_fft_dipole[i];
+    work2[n+1] = ZEROF;
+    work3[n] = densityz_fft_dipole[i];
+    work3[n+1] = ZEROF;
+    n += 2;
+  }
+
+  fft1->compute(work1,work1,1);
+  fft1->compute(work2,work2,1);
+  fft1->compute(work3,work3,1);
+
+  // global energy and virial contribution
+  // total grid-point count is formed in double, since the int product
+  // nx_pppm*ny_pppm*nz_pppm can overflow for large grids
+  const double scaleinv = 1.0/(static_cast<double>(nx_pppm)*ny_pppm*nz_pppm);
+  const double s2 = scaleinv*scaleinv;
+
+  if (eflag_global || vflag_global) {
+    if (vflag_global) {
+      n = 0;
+      ii = 0;
+      for (k = nzlo_fft; k <= nzhi_fft; k++)
+        for (j = nylo_fft; j <= nyhi_fft; j++)
+          for (i = nxlo_fft; i <= nxhi_fft; i++) {
+            wreal = (work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+            wimg = (work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+            eng = s2 * greensfn[ii] * (wreal*wreal + wimg*wimg);
+            for (int jj = 0; jj < 6; jj++) virial[jj] += eng*vg[ii][jj];
+            virial[0] += 2.0*s2*greensfn[ii]*fkx[i]*(work1[n]*wreal + work1[n+1]*wimg);
+            virial[1] += 2.0*s2*greensfn[ii]*fky[j]*(work2[n]*wreal + work2[n+1]*wimg);
+            virial[2] += 2.0*s2*greensfn[ii]*fkz[k]*(work3[n]*wreal + work3[n+1]*wimg);
+            virial[3] += 2.0*s2*greensfn[ii]*fky[j]*(work1[n]*wreal + work1[n+1]*wimg);
+            virial[4] += 2.0*s2*greensfn[ii]*fkz[k]*(work1[n]*wreal + work1[n+1]*wimg);
+            virial[5] += 2.0*s2*greensfn[ii]*fkz[k]*(work2[n]*wreal + work2[n+1]*wimg);
+            if (eflag_global) energy += eng;
+            ii++;
+            n += 2;
+          }
+    } else {
+      n = 0;
+      ii = 0;
+      for (k = nzlo_fft; k <= nzhi_fft; k++)
+        for (j = nylo_fft; j <= nyhi_fft; j++)
+          for (i = nxlo_fft; i <= nxhi_fft; i++) {
+            wreal = (work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+            wimg = (work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+            energy += s2 * greensfn[ii] * (wreal*wreal + wimg*wimg);
+            ii++;
+            n += 2;
+          }
+    }
+  }
+
+  // scale by 1/total-grid-pts to get rho(k)
+  // multiply by Green's function to get V(k)
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work1[n]   *= scaleinv * greensfn[i];
+    work1[n+1] *= scaleinv * greensfn[i];
+    work2[n]   *= scaleinv * greensfn[i];
+    work2[n+1] *= scaleinv * greensfn[i];
+    work3[n]   *= scaleinv * greensfn[i];
+    work3[n+1] *= scaleinv * greensfn[i];
+    n += 2;
+  }
+
+  // extra FFTs for per-atom energy/virial
+
+  if (vflag_atom) poisson_peratom_dipole();
+
+  // compute electric potential
+  // FFT leaves data in 3d brick decomposition
+
+  // Ex
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        work4[n+1] = fkx[i]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        ux_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Ey
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        work4[n+1] = fky[j]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        uy_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Ez
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        work4[n+1] = fkz[k]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        uz_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vxx
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*fkx[i]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fkx[i]*fkx[i]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdxx_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vyy
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*fky[j]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fky[j]*fky[j]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdyy_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vzz
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*fkz[k]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fkz[k]*fkz[k]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdzz_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vxy
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*fky[j]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fkx[i]*fky[j]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdxy_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vxz
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*fkz[k]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fkx[i]*fkz[k]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdxz_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // Vyz
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*fkz[k]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]);
+        work4[n+1] = -fky[j]*fkz[k]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]);
+        n += 2;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdyz_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+}
+
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for per-atom energy/virial
+------------------------------------------------------------------------- */
+
+void PPPMDipole::poisson_peratom_dipole()
+{
+  int i,ii,j,k,n;
+
+  // 18 components of virial in v0 thru v5
+
+  if (!vflag_atom) return;
+
+  // V0x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][0]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkx[i]*work1[n]);
+        work4[n+1] = fkx[i]*(vg[ii][0]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkx[i]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v0x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V0y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][0]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkx[i]*work1[n]);
+        work4[n+1] = fky[j]*(vg[ii][0]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkx[i]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v0y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V0z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][0]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkx[i]*work1[n]);
+        work4[n+1] = fkz[k]*(vg[ii][0]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkx[i]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v0z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V1x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][1]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work2[n]);
+        work4[n+1] = fkx[i]*(vg[ii][1]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v1x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V1y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][1]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work2[n]);
+        work4[n+1] = fky[j]*(vg[ii][1]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v1y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V1z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][1]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work2[n]);
+        work4[n+1] = fkz[k]*(vg[ii][1]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v1z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V2x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][2]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work3[n]);
+        work4[n+1] = fkx[i]*(vg[ii][2]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work3[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v2x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V2y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][2]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work3[n]);
+        work4[n+1] = fky[j]*(vg[ii][2]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work3[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v2y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V2z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][2]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work3[n]);
+        work4[n+1] = fkz[k]*(vg[ii][2]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work3[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v2z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V3x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][3]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work1[n]);
+        work4[n+1] = fkx[i]*(vg[ii][3]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v3x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V3y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][3]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work1[n]);
+        work4[n+1] = fky[j]*(vg[ii][3]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v3y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V3z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][3]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fky[j]*work1[n]);
+        work4[n+1] = fkz[k]*(vg[ii][3]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fky[j]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v3z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V4x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][4]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work1[n]);
+        work4[n+1] = fkx[i]*(vg[ii][4]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v4x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V4y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][4]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work1[n]);
+        work4[n+1] = fky[j]*(vg[ii][4]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v4y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V4z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][4]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work1[n]);
+        work4[n+1] = fkz[k]*(vg[ii][4]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work1[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v4z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V5x
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkx[i]*(vg[ii][5]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work2[n]);
+        work4[n+1] = fkx[i]*(vg[ii][5]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v5x_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V5y
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fky[j]*(vg[ii][5]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work2[n]);
+        work4[n+1] = fky[j]*(vg[ii][5]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v5y_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+
+  // V5z
+
+  n = 0;
+  ii = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++)
+    for (j = nylo_fft; j <= nyhi_fft; j++)
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        work4[n] = fkz[k]*(vg[ii][5]*(work1[n]*fkx[i] + work2[n]*fky[j] + work3[n]*fkz[k]) + 2.0*fkz[k]*work2[n]);
+        work4[n+1] = fkz[k]*(vg[ii][5]*(work1[n+1]*fkx[i] + work2[n+1]*fky[j] + work3[n+1]*fkz[k]) + 2.0*fkz[k]*work2[n+1]);
+        n += 2;
+        ii++;
+      }
+
+  fft2->compute(work4,work4,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        v5z_brick_dipole[k][j][i] = work4[n];
+        n += 2;
+      }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get electric field & force on my particles for ik
+------------------------------------------------------------------------- */
+
+void PPPMDipole::fieldforce_ik_dipole()
+{
+  // For each owned atom, gather the u*_brick_dipole grids (E-field) and the
+  // vd*_brick_dipole grids (used for the force) over the atom's stencil and
+  // add the resulting force and torque to the atom.
+  //   (gx,gy,gz) = grid indices stored in part2grid for the atom
+  //   (sx,sy,sz) = offsets handed to compute_rho1d()
+  //   (ix,iy,iz) = grid indices of the stencil point being visited
+
+  double **mu = atom->mu;
+  double **x = atom->x;
+  double **f = atom->f;
+  double **torque = atom->torque;
+  const int nlocal = atom->nlocal;
+
+  // prefactor common to all atoms
+
+  const double mufactor = qqrd2e * scale;
+
+  for (int i = 0; i < nlocal; i++) {
+    const int gx = part2grid[i][0];
+    const int gy = part2grid[i][1];
+    const int gz = part2grid[i][2];
+    const FFT_SCALAR sx = gx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    const FFT_SCALAR sy = gy+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    const FFT_SCALAR sz = gz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(sx,sy,sz);
+
+    FFT_SCALAR ex = ZEROF, ey = ZEROF, ez = ZEROF;
+    FFT_SCALAR vxx = ZEROF, vyy = ZEROF, vzz = ZEROF;
+    FFT_SCALAR vxy = ZEROF, vxz = ZEROF, vyz = ZEROF;
+    for (int n = nlower; n <= nupper; n++) {
+      const int iz = n+gz;
+      const FFT_SCALAR wz = rho1d[2][n];
+      for (int m = nlower; m <= nupper; m++) {
+        const int iy = m+gy;
+        const FFT_SCALAR wzy = wz*rho1d[1][m];
+        for (int l = nlower; l <= nupper; l++) {
+          const int ix = l+gx;
+          const FFT_SCALAR w = wzy*rho1d[0][l];
+          ex -= w*ux_brick_dipole[iz][iy][ix];
+          ey -= w*uy_brick_dipole[iz][iy][ix];
+          ez -= w*uz_brick_dipole[iz][iy][ix];
+          vxx -= w*vdxx_brick_dipole[iz][iy][ix];
+          vyy -= w*vdyy_brick_dipole[iz][iy][ix];
+          vzz -= w*vdzz_brick_dipole[iz][iy][ix];
+          vxy -= w*vdxy_brick_dipole[iz][iy][ix];
+          vxz -= w*vdxz_brick_dipole[iz][iy][ix];
+          vyz -= w*vdyz_brick_dipole[iz][iy][ix];
+        }
+      }
+    }
+
+    // force: symmetric vd tensor contracted with the moment of atom i
+
+    const double *mui = mu[i];
+    f[i][0] += mufactor*(vxx*mui[0] + vxy*mui[1] + vxz*mui[2]);
+    f[i][1] += mufactor*(vxy*mui[0] + vyy*mui[1] + vyz*mui[2]);
+    f[i][2] += mufactor*(vxz*mui[0] + vyz*mui[1] + vzz*mui[2]);
+
+    // torque: moment crossed with the interpolated E-field
+    torque[i][0] += mufactor*(mui[1]*ez - mui[2]*ey);
+    torque[i][1] += mufactor*(mui[2]*ex - mui[0]*ez);
+    torque[i][2] += mufactor*(mui[0]*ey - mui[1]*ex);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   interpolate from grid to get per-atom energy/virial
+------------------------------------------------------------------------- */
+
+void PPPMDipole::fieldforce_peratom_dipole()
+{
+  // Per-atom tallies: each owned atom gathers the u*_brick_dipole grids
+  // (energy, when eflag_atom is set) and the v0..v5 x/y/z grids (virial,
+  // when vflag_atom is set) over its stencil, then contracts the gathered
+  // values with its moment mu[i].
+  //   (gx,gy,gz) = grid indices stored in part2grid for the atom
+  //   (sx,sy,sz) = offsets handed to compute_rho1d()
+  //   (ix,iy,iz) = grid indices of the stencil point being visited
+
+  double **mu = atom->mu;
+  double **x = atom->x;
+  const int nlocal = atom->nlocal;
+
+  for (int i = 0; i < nlocal; i++) {
+    const int gx = part2grid[i][0];
+    const int gy = part2grid[i][1];
+    const int gz = part2grid[i][2];
+    const FFT_SCALAR sx = gx+shiftone - (x[i][0]-boxlo[0])*delxinv;
+    const FFT_SCALAR sy = gy+shiftone - (x[i][1]-boxlo[1])*delyinv;
+    const FFT_SCALAR sz = gz+shiftone - (x[i][2]-boxlo[2])*delzinv;
+
+    compute_rho1d(sx,sy,sz);
+
+    // u[d]: d = x,y,z ; vx/vy/vz[c]: c = virial component 0..5
+
+    FFT_SCALAR u[3] = {ZEROF,ZEROF,ZEROF};
+    FFT_SCALAR vx[6] = {ZEROF,ZEROF,ZEROF,ZEROF,ZEROF,ZEROF};
+    FFT_SCALAR vy[6] = {ZEROF,ZEROF,ZEROF,ZEROF,ZEROF,ZEROF};
+    FFT_SCALAR vz[6] = {ZEROF,ZEROF,ZEROF,ZEROF,ZEROF,ZEROF};
+
+    for (int n = nlower; n <= nupper; n++) {
+      const int iz = n+gz;
+      const FFT_SCALAR wz = rho1d[2][n];
+      for (int m = nlower; m <= nupper; m++) {
+        const int iy = m+gy;
+        const FFT_SCALAR wzy = wz*rho1d[1][m];
+        for (int l = nlower; l <= nupper; l++) {
+          const int ix = l+gx;
+          const FFT_SCALAR w = wzy*rho1d[0][l];
+          // energy grids
+
+          if (eflag_atom) {
+            u[0] += w*ux_brick_dipole[iz][iy][ix];
+            u[1] += w*uy_brick_dipole[iz][iy][ix];
+            u[2] += w*uz_brick_dipole[iz][iy][ix];
+          }
+          // virial grids, x then y then z
+
+          if (vflag_atom) {
+            vx[0] += w*v0x_brick_dipole[iz][iy][ix];
+            vx[1] += w*v1x_brick_dipole[iz][iy][ix];
+            vx[2] += w*v2x_brick_dipole[iz][iy][ix];
+            vx[3] += w*v3x_brick_dipole[iz][iy][ix];
+            vx[4] += w*v4x_brick_dipole[iz][iy][ix];
+            vx[5] += w*v5x_brick_dipole[iz][iy][ix];
+            vy[0] += w*v0y_brick_dipole[iz][iy][ix];
+            vy[1] += w*v1y_brick_dipole[iz][iy][ix];
+            vy[2] += w*v2y_brick_dipole[iz][iy][ix];
+            vy[3] += w*v3y_brick_dipole[iz][iy][ix];
+            vy[4] += w*v4y_brick_dipole[iz][iy][ix];
+            vy[5] += w*v5y_brick_dipole[iz][iy][ix];
+            vz[0] += w*v0z_brick_dipole[iz][iy][ix];
+            vz[1] += w*v1z_brick_dipole[iz][iy][ix];
+            vz[2] += w*v2z_brick_dipole[iz][iy][ix];
+            vz[3] += w*v3z_brick_dipole[iz][iy][ix];
+            vz[4] += w*v4z_brick_dipole[iz][iy][ix];
+            vz[5] += w*v5z_brick_dipole[iz][iy][ix];
+          }
+        }
+      }
+    }
+
+    // contract with the moment of atom i
+
+    const double *mui = mu[i];
+    if (eflag_atom) eatom[i] += mui[0]*u[0] + mui[1]*u[1] + mui[2]*u[2];
+    if (vflag_atom)
+      for (int c = 0; c < 6; c++)
+        vatom[i][c] += mui[0]*vx[c] + mui[1]*vy[c] + mui[2]*vz[c];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack own values to buf to send to another proc
+------------------------------------------------------------------------- */
+
+void PPPMDipole::pack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  // Pack this proc's own grid values into buf to send to another proc.
+  // flag  = FORWARD_MU or FORWARD_MU_PERATOM; any other value packs nothing
+  // buf   = send buffer, filled from index 0
+  // nlist = number of grid points to pack
+  // list  = offsets of those points from the [nzlo_out][nylo_out][nxlo_out]
+  //         corner of each brick
+  // Values are interleaved per grid point: all grids for list[0], then all
+  // grids for list[1], ...  unpack_forward() reads them in the same order.
+
+  if (flag == FORWARD_MU) {
+
+    // 3 u grids followed by 6 vd grids (xx,yy,zz,xy,xz,yz)
+
+    FFT_SCALAR *grid[9];
+    grid[0] = &ux_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[1] = &uy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[2] = &uz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[3] = &vdxx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[4] = &vdyy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[5] = &vdzz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[6] = &vdxy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[7] = &vdxz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[8] = &vdyz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+
+    int n = 0;
+    for (int i = 0; i < nlist; i++) {
+      const int ig = list[i];
+      for (int k = 0; k < 9; k++)
+        buf[n++] = grid[k][ig];
+    }
+
+  } else if (flag == FORWARD_MU_PERATOM) {
+
+    // 18 virial grids: v0..v5 for x, then for y, then for z
+
+    FFT_SCALAR *grid[18];
+    grid[0] = &v0x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[1] = &v1x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[2] = &v2x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[3] = &v3x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[4] = &v4x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[5] = &v5x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[6] = &v0y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[7] = &v1y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[8] = &v2y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[9] = &v3y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[10] = &v4y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[11] = &v5y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[12] = &v0z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[13] = &v1z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[14] = &v2z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[15] = &v3z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[16] = &v4z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[17] = &v5z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+
+    int n = 0;
+    for (int i = 0; i < nlist; i++) {
+      const int ig = list[i];
+      for (int k = 0; k < 18; k++)
+        buf[n++] = grid[k][ig];
+    }
+
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack another proc's own values from buf and set own ghost values
+------------------------------------------------------------------------- */
+
+void PPPMDipole::unpack_forward(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  // Unpack another proc's own values from buf into this proc's grids.
+  // flag  = FORWARD_MU or FORWARD_MU_PERATOM; any other value unpacks nothing
+  // buf   = receive buffer, read from index 0
+  // nlist = number of grid points to unpack
+  // list  = offsets of those points from the [nzlo_out][nylo_out][nxlo_out]
+  //         corner of each brick
+  // The per-point ordering of the grids matches what pack_forward() writes;
+  // existing grid values at the listed points are overwritten.
+
+  if (flag == FORWARD_MU) {
+
+    // 3 u grids followed by 6 vd grids (xx,yy,zz,xy,xz,yz)
+
+    FFT_SCALAR *grid[9];
+    grid[0] = &ux_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[1] = &uy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[2] = &uz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[3] = &vdxx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[4] = &vdyy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[5] = &vdzz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[6] = &vdxy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[7] = &vdxz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[8] = &vdyz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+
+    int n = 0;
+    for (int i = 0; i < nlist; i++) {
+      const int ig = list[i];
+      for (int k = 0; k < 9; k++)
+        grid[k][ig] = buf[n++];
+    }
+
+  } else if (flag == FORWARD_MU_PERATOM) {
+
+    // 18 virial grids: v0..v5 for x, then for y, then for z
+
+    FFT_SCALAR *grid[18];
+    grid[0] = &v0x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[1] = &v1x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[2] = &v2x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[3] = &v3x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[4] = &v4x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[5] = &v5x_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[6] = &v0y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[7] = &v1y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[8] = &v2y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[9] = &v3y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[10] = &v4y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[11] = &v5y_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[12] = &v0z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[13] = &v1z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[14] = &v2z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[15] = &v3z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[16] = &v4z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+    grid[17] = &v5z_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+
+    int n = 0;
+    for (int i = 0; i < nlist; i++) {
+      const int ig = list[i];
+      for (int k = 0; k < 18; k++)
+        grid[k][ig] = buf[n++];
+    }
+
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack ghost values into buf to send to another proc
+------------------------------------------------------------------------- */
+
+void PPPMDipole::pack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  // only REVERSE_MU is handled: ghost values of the 3 dipole density grids
+  if (flag != REVERSE_MU) return;
+  FFT_SCALAR *rhox = &densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+  FFT_SCALAR *rhoy = &densityy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+  FFT_SCALAR *rhoz = &densityz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+  for (int i = 0; i < nlist; i++) {
+    const int ig = list[i];
+    buf[3*i] = rhox[ig];
+    buf[3*i+1] = rhoy[ig];
+    buf[3*i+2] = rhoz[ig];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack another proc's ghost values from buf and add to own values
+------------------------------------------------------------------------- */
+
+void PPPMDipole::unpack_reverse(int flag, FFT_SCALAR *buf, int nlist, int *list)
+{
+  // only REVERSE_MU is handled: sum received values into the 3 density grids
+  if (flag != REVERSE_MU) return;
+  FFT_SCALAR *rhox = &densityx_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+  FFT_SCALAR *rhoy = &densityy_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+  FFT_SCALAR *rhoz = &densityz_brick_dipole[nzlo_out][nylo_out][nxlo_out];
+  for (int i = 0; i < nlist; i++) {
+    const int ig = list[i];
+    rhox[ig] += buf[3*i];
+    rhoy[ig] += buf[3*i+1];
+    rhoz[ig] += buf[3*i+2];
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Slab-geometry correction term to dampen inter-slab interactions between
+   periodically repeating slabs.  Yields good approximation to 2D Ewald if
+   adequate empty space is left between repeating slabs (J. Chem. Phys.
+   111, 3155).  Slabs defined here to be parallel to the xy plane.  The
+   non-neutral extension (J. Chem. Phys. 131, 094107) is not yet supported.
+------------------------------------------------------------------------- */
+
+void PPPMDipole::slabcorr()
+{
+  // compute local contribution to global dipole moment
+  // (only the z components of the moments enter)
+
+  const int nlocal = atom->nlocal;
+  double **mu = atom->mu;
+
+  double dipole = 0.0;
+  for (int i = 0; i < nlocal; i++) dipole += mu[i][2];
+
+  // sum local contributions to get global dipole moment
+
+  double dipole_all;
+  MPI_Allreduce(&dipole,&dipole_all,1,MPI_DOUBLE,MPI_SUM,world);
+
+  // non-neutral systems and per-atom energy are not handled for dipoles:
+  // error->all() is raised in either case
+
+  if (eflag_atom || fabs(qsum) > SMALL) {
+
+    error->all(FLERR,"Cannot (yet) use kspace slab correction with "
+      "long-range dipoles and non-neutral systems or per-atom energy");
+  }
+
+  // compute corrections
+
+  const double e_slabcorr = MY_2PI*(dipole_all*dipole_all/12.0)/volume;
+  const double qscale = qqrd2e * scale;
+
+  if (eflag_global) energy += qscale * e_slabcorr;
+
+  // per-atom energy
+  // NOTE(review): the check above rejects eflag_atom, so this looks unreachable
+
+  if (eflag_atom) {
+    double efact = qscale * MY_2PI/volume/12.0;
+    for (int i = 0; i < nlocal; i++)
+      eatom[i] += efact * mu[i][2]*dipole_all;
+  }
+
+  // add on torque corrections
+  // (x and y components only)
+
+  if (atom->torque) {
+    double ffact = qscale * (-4.0*MY_PI/volume);
+    double **torque = atom->torque;
+    for (int i = 0; i < nlocal; i++) {
+      torque[i][0] += ffact * dipole_all * mu[i][1];
+      torque[i][1] += -ffact * dipole_all * mu[i][0];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   perform and time the 1d FFTs required for N timesteps
+------------------------------------------------------------------------- */
+
+int PPPMDipole::timing_1d(int n, double &time1d)
+{
+  // FFT count per step: 3 with fft1 (flag 1) and 9 with fft2 (flag -1)
+
+  const int nforward = 3;
+  const int nbackward = 9;
+
+  // start from a zeroed work array
+
+  const int nwork = 2*nfft_both;
+  for (int i = 0; i < nwork; i++) work1[i] = ZEROF;
+
+  MPI_Barrier(world);
+  const double tstart = MPI_Wtime();
+
+  for (int istep = 0; istep < n; istep++) {
+    for (int k = 0; k < nforward; k++)
+      fft1->timing1d(work1,nfft_both,1);
+    for (int k = 0; k < nbackward; k++)
+      fft2->timing1d(work1,nfft_both,-1);
+  }
+
+  MPI_Barrier(world);
+  const double tstop = MPI_Wtime();
+  time1d = tstop - tstart;
+
+  // number of FFTs timed per step
+
+  return nforward + nbackward;
+}
+
+/* ----------------------------------------------------------------------
+   perform and time the 3d FFTs required for N timesteps
+------------------------------------------------------------------------- */
+
+int PPPMDipole::timing_3d(int n, double &time3d)
+{
+  // FFT count per step: 3 with fft1 (flag 1) and 9 with fft2 (flag -1)
+
+  const int nforward = 3;
+  const int nbackward = 9;
+
+  // start from a zeroed work array
+
+  const int nwork = 2*nfft_both;
+  for (int i = 0; i < nwork; i++) work1[i] = ZEROF;
+
+  MPI_Barrier(world);
+  const double tstart = MPI_Wtime();
+
+  for (int istep = 0; istep < n; istep++) {
+    for (int k = 0; k < nforward; k++)
+      fft1->compute(work1,work1,1);
+    for (int k = 0; k < nbackward; k++)
+      fft2->compute(work1,work1,-1);
+  }
+
+  MPI_Barrier(world);
+  const double tstop = MPI_Wtime();
+  time3d = tstop - tstart;
+
+  // number of FFTs timed per step
+
+  return nforward + nbackward;
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of local arrays
+------------------------------------------------------------------------- */
+
+double PPPMDipole::memory_usage()
+{
+  // number of grid points spanned by the *_out index ranges
+  const int nxb = nxhi_out-nxlo_out+1;
+  const int nyb = nyhi_out-nylo_out+1;
+  const int nzb = nzhi_out-nzlo_out+1;
+  const int nbrick = nxb * nyb * nzb;
+
+  double bytes = nmax*3 * sizeof(double);
+  bytes += 6 * nfft_both * sizeof(double);       // vg
+  bytes += nfft_both * sizeof(double);           // greensfn
+  bytes += nfft_both*5 * sizeof(FFT_SCALAR);     // work*2*2
+  bytes += 9 * nbrick * sizeof(FFT_SCALAR);      // ubrick*3 + vdbrick*6
+  bytes += nfft_both*7 * sizeof(FFT_SCALAR);     // density_fft*3 + work*2*2
+  if (peratom_allocate_flag) bytes += 21 * nbrick * sizeof(FFT_SCALAR);
+  if (cg_dipole) bytes += cg_dipole->memory_usage();
+  if (cg_peratom_dipole) bytes += cg_peratom_dipole->memory_usage();
+  return bytes;
+}
+
+/* ----------------------------------------------------------------------
+   compute musum,musqsum,mu2
+   called initially, when particle count changes, when dipoles are changed
+------------------------------------------------------------------------- */
+
+void PPPMDipole::musum_musq()
+{
+  const int nlocal = atom->nlocal;
+
+  musum = musqsum = mu2 = 0.0;
+  if (atom->mu_flag) {
+    double** mu = atom->mu;
+    double musum_local(0.0), musqsum_local(0.0);
+    for (int i = 0; i < nlocal; i++) {
+      musum_local += mu[i][0] + mu[i][1] + mu[i][2];
+      musqsum_local += mu[i][0]*mu[i][0] + mu[i][1]*mu[i][1] + mu[i][2]*mu[i][2];
+    }
+
+    MPI_Allreduce(&musum_local,&musum,1,MPI_DOUBLE,MPI_SUM,world);
+    MPI_Allreduce(&musqsum_local,&musqsum,1,MPI_DOUBLE,MPI_SUM,world);
+    mu2 = musqsum * force->qqrd2e;
+  }
+
+  // mu2 comes from the Allreduce results above (or is left at 0), and
+  // error->all() must be called by all procs, so do not test comm->me here
+  if (mu2 == 0.0)
+    error->all(FLERR,"Using kspace solver PPPMDipole on system with no dipoles");
+}
\ No newline at end of file
diff --git a/src/KSPACE/pppm_spin.h b/src/KSPACE/pppm_spin.h
new file mode 100644
index 0000000000..4d6906f974
--- /dev/null
+++ b/src/KSPACE/pppm_spin.h
@@ -0,0 +1,213 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef KSPACE_CLASS
+
+KSpaceStyle(pppm/dipole,PPPMDipole)
+
+#else
+
+#ifndef LMP_PPPM_DIPOLE_H
+#define LMP_PPPM_DIPOLE_H
+
+#include "pppm.h"
+
+namespace LAMMPS_NS {
+
+class PPPMDipole : public PPPM {   // PPPM variant acting on atom->mu dipoles
+ public:
+  PPPMDipole(class LAMMPS *, int, char **);
+  virtual ~PPPMDipole();
+  void init();
+  void setup();
+  void setup_grid();
+  void compute(int, int);
+  int timing_1d(int, double &);   // time n rounds of 1d FFTs, returns 12
+  int timing_3d(int, double &);   // time n rounds of 3d FFTs, returns 12
+  double memory_usage();          // bytes used by local arrays
+
+ protected:
+  void set_grid_global();
+  double newton_raphson_f();
+
+  void allocate();
+  void allocate_peratom();
+  void deallocate();
+  void deallocate_peratom();
+  void compute_gf_denom();
+
+  void slabcorr();                // slab-geometry correction: energy + torque
+
+  // grid communication (flags: FORWARD_MU, FORWARD_MU_PERATOM, REVERSE_MU)
+
+  void pack_forward(int, FFT_SCALAR *, int, int *);
+  void unpack_forward(int, FFT_SCALAR *, int, int *);
+  void pack_reverse(int, FFT_SCALAR *, int, int *);
+  void unpack_reverse(int, FFT_SCALAR *, int, int *);
+
+  // dipole grids: density* bricks go through pack/unpack_reverse; u*, vd*
+  // and v0..v5 x/y/z bricks go through pack/unpack_forward
+  FFT_SCALAR ***densityx_brick_dipole,***densityy_brick_dipole,***densityz_brick_dipole;
+  FFT_SCALAR ***vdxx_brick_dipole,***vdyy_brick_dipole,***vdzz_brick_dipole;
+  FFT_SCALAR ***vdxy_brick_dipole,***vdxz_brick_dipole,***vdyz_brick_dipole;
+  FFT_SCALAR ***ux_brick_dipole,***uy_brick_dipole,***uz_brick_dipole;
+  FFT_SCALAR ***v0x_brick_dipole,***v1x_brick_dipole,***v2x_brick_dipole;
+  FFT_SCALAR ***v3x_brick_dipole,***v4x_brick_dipole,***v5x_brick_dipole;
+  FFT_SCALAR ***v0y_brick_dipole,***v1y_brick_dipole,***v2y_brick_dipole;
+  FFT_SCALAR ***v3y_brick_dipole,***v4y_brick_dipole,***v5y_brick_dipole;
+  FFT_SCALAR ***v0z_brick_dipole,***v1z_brick_dipole,***v2z_brick_dipole;
+  FFT_SCALAR ***v3z_brick_dipole,***v4z_brick_dipole,***v5z_brick_dipole;
+  FFT_SCALAR *work3,*work4;       // extra FFT work arrays
+  FFT_SCALAR *densityx_fft_dipole,*densityy_fft_dipole,*densityz_fft_dipole;
+  class GridComm *cg_dipole;          // counted in memory_usage() when non-NULL
+  class GridComm *cg_peratom_dipole;  // counted in memory_usage() when non-NULL
+  int only_dipole_flag;
+  double musum,musqsum,mu2;       // set by musum_musq()
+  double find_gewald_dipole(double, double, bigint, double, double);
+  double newton_raphson_f_dipole(double, double, bigint, double, double);
+  double derivf_dipole(double, double, bigint, double, double);
+  double compute_df_kspace_dipole();
+  double compute_qopt_dipole();
+  void compute_gf_dipole();
+  void make_rho_dipole();
+  void brick2fft_dipole();
+  void poisson_ik_dipole();
+  void poisson_peratom_dipole();
+  void fieldforce_ik_dipole();        // grid -> force and torque on my atoms
+  void fieldforce_peratom_dipole();   // grid -> per-atom energy/virial
+  double final_accuracy_dipole();
+  void musum_musq();                  // compute musum,musqsum,mu2
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Cannot (yet) use charges with Kspace style PPPMDipole
+
+Charge-dipole interactions are not yet implemented in PPPMDipole so this
+feature is not yet supported.
+
+E: Must redefine kspace_style after changing to triclinic box
+
+Self-explanatory.
+
+E: Kspace style requires atom attribute mu
+
+The atom style defined does not have this attribute.
+
+E: Cannot (yet) use kspace_modify diff ad with dipoles
+
+This feature is not yet supported.
+
+E: Cannot (yet) use 'electron' units with dipoles
+
+This feature is not yet supported.
+
+E: Cannot yet use triclinic cells with PPPMDipole
+
+This feature is not yet supported.
+
+E: Cannot yet use TIP4P with PPPMDipole
+
+This feature is not yet supported.
+
+E: Cannot use nonperiodic boundaries with PPPM
+
+For kspace style pppm, all 3 dimensions must have periodic boundaries
+unless you use the kspace_modify command to define a 2d slab with a
+non-periodic z dimension.
+
+E: Incorrect boundaries with slab PPPM
+
+Must have periodic x,y dimensions and non-periodic z dimension to use
+2d slab option with PPPM.
+
+E: PPPM order cannot be < 2 or > than %d
+
+This is a limitation of the PPPM implementation in LAMMPS.
+
+E: KSpace style is incompatible with Pair style
+
+Setting a kspace style requires that a pair style with matching
+long-range dipole components be used.
+
+W: Reducing PPPM order b/c stencil extends beyond nearest neighbor processor
+
+This may lead to a larger grid than desired. See the kspace_modify overlap
+command to prevent changing of the PPPM order.
+
+E: PPPM order < minimum allowed order
+
+The default minimum order is 2. This can be reset by the
+kspace_modify minorder command.
+
+E: PPPM grid stencil extends beyond nearest neighbor processor
+
+This is not allowed if the kspace_modify overlap setting is no.
+
+E: KSpace accuracy must be > 0
+
+The kspace accuracy designated in the input must be greater than zero.
+
+E: Could not compute grid size
+
+The code is unable to compute a grid size consistent with the desired
+accuracy. This error should not occur for typical problems. Please
+send an email to the developers.
+
+E: PPPM grid is too large
+
+The global PPPM grid is larger than OFFSET in one or more dimensions.
+OFFSET is currently set to 4096. You likely need to decrease the
+requested accuracy.
+
+E: Could not compute g_ewald
+
+The Newton-Raphson solver failed to converge to a good value for
+g_ewald. This error should not occur for typical problems. Please
+send an email to the developers.
+
+E: Non-numeric box dimensions - simulation unstable
+
+The box size has apparently blown up.
+
+E: Out of range atoms - cannot compute PPPM
+
+One or more atoms are attempting to map their charge to a PPPM grid
+point that is not owned by a processor. This is likely for one of two
+reasons, both of them bad. First, it may mean that an atom near the
+boundary of a processor's sub-domain has moved more than 1/2 the
+"neighbor skin distance"_neighbor.html without neighbor lists being
+rebuilt and atoms being migrated to new processors. This also means
+you may be missing pairwise interactions that need to be computed.
+The solution is to change the re-neighboring criteria via the
+"neigh_modify"_neigh_modify command. The safest settings are "delay 0
+every 1 check yes". Second, it may mean that an atom has moved far
+outside a processor's sub-domain or even the entire simulation box.
+This indicates bad physics, e.g. due to highly overlapping atoms, too
+large a timestep, etc.
+
+E: Using kspace solver PPPMDipole on system with no dipoles
+
+Must have non-zero dipoles with PPPMDipole.
+
+E: Must use kspace_modify gewald for system with no dipoles
+
+Self-explanatory.
+
+*/
diff --git a/src/SPIN/pair_spin_long.cpp b/src/SPIN/pair_spin_long.cpp
new file mode 100644
index 0000000000..66b684ae1d
--- /dev/null
+++ b/src/SPIN/pair_spin_long.cpp
@@ -0,0 +1,550 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   www.cs.sandia.gov/~sjplimp/lammps.html
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ------------------------------------------------------------------------
+   Contributing authors: Julien Tranchida (SNL)
+                         Stan Moore (SNL)
+   
+   Please cite the related publication:
+   Tranchida, J., Plimpton, S. J., Thibaudeau, P., & Thompson, A. P. (2018). 
+   Massively parallel symplectic algorithm for coupled magnetic spin dynamics 
+   and molecular dynamics. arXiv preprint arXiv:1801.10233.
+------------------------------------------------------------------------- */
+
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include "pair_spin_long.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "fix_nve_spin.h"
+#include "force.h"
+#include "kspace.h"
+#include "math_const.h"
+#include "memory.h"
+#include "modify.h"
+#include "error.h"
+#include "update.h"
+
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairSpinLong::PairSpinLong(LAMMPS *lmp) :
+  PairSpin(lmp), lattice_flag(0), lockfixnvespin(NULL)
+{
+  // capabilities of this pair style: compatible with Ewald and PPPM,
+  // single() and rRESPA disabled, fdotr virial computation disabled
+  ewaldflag = pppmflag = 1;
+  single_enable = respa_enable = 0;
+  no_virial_fdotr_compute = 1;
+
+  // physical constants and the prefactors derived from them
+  mub = 5.78901e-5;                     // Bohr's magneton, in eV/T
+  mu_0 = 1.2566370614e-6;               // vacuum permeability, in T.m/A
+  hbar = force->hplanck/MY_2PI;         // in eV/(rad.THz)
+  mub2mu0 = mub * mub * mu_0;           // prefactor for mech force
+  mub2mu0hbinv = mub2mu0 / hbar;        // prefactor for mag force
+}
+
+/* ----------------------------------------------------------------------
+   free all arrays
+------------------------------------------------------------------------- */
+
+PairSpinLong::~PairSpinLong()
+{
+  // the per-type arrays exist only once allocate() has run
+  if (!allocated) return;
+
+  memory->destroy(setflag);
+  memory->destroy(cutsq);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSpinLong::compute(int eflag, int vflag)
+{
+  // Real-space part of the long-range spin-spin interaction.
+  // eflag/vflag are forwarded to ev_setup() and select the energy
+  // and virial tallies. For each pair, the magnetic term is added to
+  // atom->fm and, when lattice_flag is set, the mechanical force is
+  // added to atom->f.
+
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double r,rinv,r2inv,rsq;
+  double grij,expm2,t,erfc;
+  double bij[4];
+  double evdwl,ecoul;
+  double xi[3],rij[3];
+  double spi[4],spj[4],fi[3],fmi[3];
+  double pre1,pre2,pre3;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double **fm = atom->fm;
+  double **sp = atom->sp;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int newton_pair = force->newton_pair;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // prefactors of the Gaussian terms in the bij[] recursion below
+
+  pre1 = 2.0 * g_ewald / MY_PIS;
+  pre2 = 4.0 * pow(g_ewald,3.0) / MY_PIS;
+  pre3 = 8.0 * pow(g_ewald,5.0) / MY_PIS;
+
+  // computation of the long-range interaction
+  // loop over atoms and their neighbors
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    xi[0] = x[i][0];
+    xi[1] = x[i][1];
+    xi[2] = x[i][2];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    spi[0] = sp[i][0];
+    spi[1] = sp[i][1];
+    spi[2] = sp[i][2];
+    spi[3] = sp[i][3];
+    itype = type[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      spj[0] = sp[j][0];
+      spj[1] = sp[j][1];
+      spj[2] = sp[j][2];
+      spj[3] = sp[j][3];
+
+      evdwl = 0.0;
+      fi[0] = fi[1] = fi[2] = 0.0;
+      fmi[0] = fmi[1] = fmi[2] = 0.0;
+      bij[0] = bij[1] = bij[2] = bij[3] = 0.0;
+
+      rij[0] = x[j][0] - xi[0];
+      rij[1] = x[j][1] - xi[1];
+      rij[2] = x[j][2] - xi[2];
+      rsq = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+        rinv = sqrt(r2inv);
+
+        if (rsq < cut_spinsq) {
+          r = sqrt(rsq);
+          grij = g_ewald * r;
+          expm2 = exp(-grij*grij);
+          t = 1.0 / (1.0 + EWALD_P*grij);
+          erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+
+          bij[0] = erfc * rinv;
+          bij[1] = (bij[0] + pre1*expm2) * r2inv;
+          bij[2] = (3.0*bij[1] + pre2*expm2) * r2inv;
+          bij[3] = (5.0*bij[2] + pre3*expm2) * r2inv;
+
+          // the magnetic term goes to fmi, the mechanical one to fi;
+          // lattice_flag is the flag for mech force computation
+
+          compute_long(i,j,rij,bij,fmi,spi,spj);
+          if (lattice_flag) compute_long_mech(i,j,rij,bij,fi,spi,spj);
+        }
+      }
+
+      // apply the unit prefactors once, so that both atoms of the
+      // pair and the energy/virial tallies all use the same values
+
+      for (int k = 0; k < 3; k++) {
+        fi[k] *= mub2mu0;
+        fmi[k] *= mub2mu0hbinv;
+        f[i][k] += fi[k];
+        fm[i][k] += fmi[k];
+        if (newton_pair || j < nlocal) f[j][k] -= fi[k];
+      }
+
+      // pair energy from the scaled magnetic term; zero unless eflag
+
+      if (eflag && rsq <= cut_spinsq)
+        evdwl = -hbar * (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
+
+      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+          evdwl,ecoul,fi[0],fi[1],fi[2],rij[0],rij[1],rij[2]);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   update the pair interaction fmi acting on the spin ii
+------------------------------------------------------------------------- */
+
+void PairSpinLong::compute_single_pair(int ii, double fmi[3])
+{
+  // Adds to fmi the real-space long-range term acting on the spin
+  // of atom ilist[ii], summed over its neighbors inside the cutoff.
+  // NOTE(review): assumes the caller zeroes fmi beforehand - confirm.
+
+  int i,j,jj,jnum,itype,jtype;
+  double r,rinv,r2inv,rsq;
+  double grij,expm2,t,erfc;
+  double bij[4],xi[3],rij[3],spi[4],spj[4],fmloc[3];
+  double pre1,pre2,pre3;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  double **x = atom->x;
+  double **sp = atom->sp;
+  int *type = atom->type;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  pre1 = 2.0 * g_ewald / MY_PIS;
+  pre2 = 4.0 * pow(g_ewald,3.0) / MY_PIS;
+  pre3 = 8.0 * pow(g_ewald,5.0) / MY_PIS;
+
+  i = ilist[ii];
+  xi[0] = x[i][0];
+  xi[1] = x[i][1];
+  xi[2] = x[i][2];
+  spi[0] = sp[i][0];
+  spi[1] = sp[i][1];
+  spi[2] = sp[i][2];
+  spi[3] = sp[i][3];
+  jlist = firstneigh[i];
+  jnum = numneigh[i];
+  itype = type[i];
+
+  // the sum over neighbors is built in a local accumulator that is
+  // zeroed once, before the loop: resetting it per neighbor would
+  // discard every contribution but the last one
+
+  fmloc[0] = fmloc[1] = fmloc[2] = 0.0;
+
+  for (jj = 0; jj < jnum; jj++) {
+    j = jlist[jj];
+    j &= NEIGHMASK;
+    jtype = type[j];
+
+    spj[0] = sp[j][0];
+    spj[1] = sp[j][1];
+    spj[2] = sp[j][2];
+    spj[3] = sp[j][3];
+
+    rij[0] = x[j][0] - xi[0];
+    rij[1] = x[j][1] - xi[1];
+    rij[2] = x[j][2] - xi[2];
+    rsq = rij[0]*rij[0] + rij[1]*rij[1] + rij[2]*rij[2];
+
+    if (rsq < cutsq[itype][jtype] && rsq < cut_spinsq) {
+      r2inv = 1.0/rsq;
+      rinv = sqrt(r2inv);
+      r = sqrt(rsq);
+      grij = g_ewald * r;
+      expm2 = exp(-grij*grij);
+      t = 1.0 / (1.0 + EWALD_P*grij);
+      erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+      bij[0] = erfc * rinv;
+      bij[1] = (bij[0] + pre1*expm2) * r2inv;
+      bij[2] = (3.0*bij[1] + pre2*expm2) * r2inv;
+      bij[3] = (5.0*bij[2] + pre3*expm2) * r2inv;
+
+      compute_long(i,j,rij,bij,fmloc,spi,spj);
+    }
+  }
+
+  // scale only this style's contribution before adding it, so that
+  // whatever the caller already stored in fmi is left untouched
+
+  fmi[0] += fmloc[0] * mub2mu0hbinv;
+  fmi[1] += fmloc[1] * mub2mu0hbinv;
+  fmi[2] += fmloc[2] * mub2mu0hbinv;
+}
+
+/* ----------------------------------------------------------------------
+   compute the real-space long-range term acting on spin i due to spin j
+------------------------------------------------------------------------- */
+
+void PairSpinLong::compute_long(int i, int j, double rij[3],
+    double bij[4], double fmi[3], double spi[4], double spj[4])
+{
+  // i and j are not used here, only the pair geometry and the spins
+
+  const double gigj = spi[3] * spj[3];
+  const double b1 = bij[1];
+
+  // (spj . rij) weighted by bij[2], shared by the three components
+
+  const double sjdotr = spj[0]*rij[0] + spj[1]*rij[1] + spj[2]*rij[2];
+  const double b2sjdotr = bij[2] * sjdotr;
+
+  for (int k = 0; k < 3; k++)
+    fmi[k] += gigj * (b2sjdotr * rij[k] - b1 * spj[k]);
+}
+
+/* ----------------------------------------------------------------------
+   compute the mechanical force on atom i due to the long-range interaction with atom j
+------------------------------------------------------------------------- */
+
+void PairSpinLong::compute_long_mech(int i, int j, double rij[3],
+    double bij[4], double fi[3], double spi[4], double spj[4])
+{
+  // spi and spj carry four entries: the three spin components and,
+  // in entry [3], a scalar factor that is multiplied into gigj
+
+  double sdots,sidotr,sjdotr,b2,b3;
+  double g1,g2,g1b2_g2b3,gigj;
+
+  gigj = spi[3] * spj[3];
+  sdots = spi[0]*spj[0] + spi[1]*spj[1] + spi[2]*spj[2];
+  sidotr = spi[0]*rij[0] + spi[1]*rij[1] + spi[2]*rij[2];
+  sjdotr = spj[0]*rij[0] + spj[1]*rij[1] + spj[2]*rij[2];
+
+  b2 = bij[2];
+  b3 = bij[3];
+  g1 = sdots;
+  g2 = -sidotr*sjdotr;
+  g1b2_g2b3 = g1*b2 + g2*b3;
+
+  for (int k = 0; k < 3; k++)
+    fi[k] += gigj * (rij[k] * g1b2_g2b3 +
+        b2 * (sjdotr*spi[k] + sidotr*spj[k]));
+}
+
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairSpinLong::allocate()
+{
+  allocated = 1;
+  const int ntypes = atom->ntypes;
+  const int dim = ntypes + 1;      // loops below index types from 1
+  memory->create(setflag,dim,dim,"pair:setflag");
+  memory->create(cutsq,dim,dim,"pair:cutsq");
+
+  // clear the upper triangle (ti <= tj) of setflag
+  for (int ti = 1; ti <= ntypes; ti++)
+    for (int tj = ti; tj <= ntypes; tj++) setflag[ti][tj] = 0;
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairSpinLong::settings(int narg, char **arg)
+{
+  // one or two arguments are accepted, but only arg[0] is read
+  const bool nargs_ok = (narg == 1 || narg == 2);
+  if (!nargs_ok) error->all(FLERR,"Incorrect args in pair_style command");
+
+  const bool metal_units = (strcmp(update->unit_style,"metal") == 0);
+  if (!metal_units)
+    error->all(FLERR,"Spin simulations require metal unit style");
+  cut_spin = force->numeric(FLERR,arg[0]);
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairSpinLong::coeff(int narg, char **arg)
+{
+  // Expected arguments: itype jtype long
+  //   arg[0], arg[1]  type ranges, expanded by force->bounds()
+  //   arg[2]          the keyword "long"
+  // The count is validated first, so that arg[2] is known to exist
+  // before it is compared against the keyword.
+
+  if (narg != 3)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+  if (strcmp(arg[2],"long") != 0)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+  if (!allocated) allocate();
+
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      setflag[i][j] = 1;
+      count++;
+    }
+  }
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairSpinLong::init_one(int i, int j)
+{
+  // every type pair shares the single global cutoff
+  if (!setflag[i][j])
+    error->all(FLERR,"All pair coeffs are not set");
+  return cut_spin;
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+void PairSpinLong::init_style()
+{
+  // Checks done at init time, in order:
+  //   - the atom style provides spins (atom->sp_flag)
+  //   - a fix nve/spin is defined
+  //   - a KSpace style is defined
+
+  if (!atom->sp_flag)
+    error->all(FLERR,"Pair spin requires atom/spin style");
+
+  // request a full neighbor list
+
+  const int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->half = 0;
+  neighbor->requests[irequest]->full = 1;
+
+  // scan the fixes for nve/spin: at least one must be defined, and
+  // the last one listed supplies lattice_flag
+
+  bool found = false;
+  for (int ifix = 0; ifix < modify->nfix; ifix++) {
+    if (strcmp(modify->fix[ifix]->style,"nve/spin") != 0) continue;
+    found = true;
+    lockfixnvespin = (FixNVESpin *) modify->fix[ifix];
+    lattice_flag = lockfixnvespin->lattice_flag;
+  }
+  if (!found)
+    error->all(FLERR,"pair/spin style requires nve/spin");
+
+  // ensure use of a KSpace long-range solver: it provides g_ewald
+
+  if (force->kspace == NULL)
+    error->all(FLERR,"Pair style requires a KSpace style");
+  g_ewald = force->kspace->g_ewald;
+
+  // squared cutoff, precomputed from the global cutoff
+
+  cut_spinsq = cut_spin * cut_spin;
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinLong::write_restart(FILE *fp)
+{
+  // layout: global settings, then setflag for each pair ti <= tj
+  write_restart_settings(fp);
+
+  const int ntypes = atom->ntypes;
+  for (int ti = 1; ti <= ntypes; ti++)
+    for (int tj = ti; tj <= ntypes; tj++)
+      fwrite(&setflag[ti][tj],sizeof(int),1,fp);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinLong::read_restart(FILE *fp)
+{
+  read_restart_settings(fp);
+  allocate();
+
+  // only rank 0 reads the file; each flag is then broadcast
+  const bool reader = (comm->me == 0);
+  const int ntypes = atom->ntypes;
+  for (int ti = 1; ti <= ntypes; ti++)
+    for (int tj = ti; tj <= ntypes; tj++) {
+      if (reader) fread(&setflag[ti][tj],sizeof(int),1,fp);
+      MPI_Bcast(&setflag[ti][tj],1,MPI_INT,0,world);
+    }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinLong::write_restart_settings(FILE *fp)
+{
+  fwrite(&cut_spin,sizeof(cut_spin),1,fp);    // global cutoff (double)
+  fwrite(&mix_flag,sizeof(int),1,fp);         // written after the cutoff
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinLong::read_restart_settings(FILE *fp)
+{
+  // rank 0 reads both settings, then every rank receives them
+  const bool reader = (comm->me == 0);
+  if (reader) fread(&cut_spin,sizeof(double),1,fp);
+  if (reader) fread(&mix_flag,sizeof(int),1,fp);
+  MPI_Bcast(&cut_spin,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void *PairSpinLong::extract(const char *str, int &dim)
+{
+  // every property exposed here is a scalar (dim = 0);
+  // dim is left untouched when the name is not recognized
+
+  void *ptr = NULL;
+
+  if (!strcmp(str,"cut") || !strcmp(str,"cut_coul")) {
+    ptr = (void *) &cut_spin;
+  } else if (!strcmp(str,"ewald_order")) {
+    // bits 1 and 3 are set
+    ewald_order = (1<<1) | (1<<3);
+    ptr = (void *) &ewald_order;
+  } else if (!strcmp(str,"ewald_mix")) {
+    ptr = (void *) &mix_flag;
+  }
+
+  if (ptr != NULL) dim = 0;
+  return ptr;
+}
diff --git a/src/SPIN/pair_spin_long.h b/src/SPIN/pair_spin_long.h
new file mode 100644
index 0000000000..867b771f74
--- /dev/null
+++ b/src/SPIN/pair_spin_long.h
@@ -0,0 +1,97 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   www.cs.sandia.gov/~sjplimp/lammps.html
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(spin/long,PairSpinLong)
+
+#else
+
+#ifndef LMP_PAIR_SPIN_LONG_H
+#define LMP_PAIR_SPIN_LONG_H
+
+#include "pair_spin.h"
+
+namespace LAMMPS_NS {
+
+class PairSpinLong : public PairSpin {
+ public:
+  double cut_coul;      // NOTE(review): not referenced in pair_spin_long.cpp
+  double **sigma;       // NOTE(review): not referenced in pair_spin_long.cpp
+
+  PairSpinLong(class LAMMPS *);
+  ~PairSpinLong();
+  void settings(int, char **);          // reads the global cutoff
+  void coeff(int, char **);             // marks type pairs in setflag
+  double init_one(int, int);            // returns the cutoff of a type pair
+  void init_style();
+  void *extract(const char *, int &);   // lookup of a setting by name
+  // evaluation over all neighbor pairs / over the neighbors of one atom
+  void compute(int, int);
+  void compute_single_pair(int, double *);
+  // per-pair kernels: magnetic term and mechanical force
+  void compute_long(int, int, double *, double *, double *,
+      double *, double *);
+  void compute_long_mech(int, int, double *, double *, double *,
+      double *, double *);
+
+  void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_restart_settings(FILE *);
+  void read_restart_settings(FILE *);
+
+ protected:
+  double hbar;                  // reduced Planck's constant
+  double mub;                   // Bohr's magneton
+  double mu_0;                  // vacuum permeability
+  double mub2mu0;               // prefactor for mech force
+  double mub2mu0hbinv;          // prefactor for mag force
+  double cut_spin, cut_spinsq;  // global cutoff and its square
+
+  double g_ewald;               // copied from force->kspace in init_style()
+  int ewald_order;              // bit mask filled in by extract()
+
+  int lattice_flag;                     // flag for mech force computation
+  class FixNVESpin *lockfixnvespin;     // ptr for setups
+
+  void allocate();
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Incorrect args in pair_style command
+
+Self-explanatory.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Spin simulations require metal unit style
+
+This pair style checks that the unit style is metal.
+
+E: Pair spin requires atom/spin style
+
+The atom style defined does not provide the spin attributes.
+
+E: Pair style requires a KSpace style
+
+No kspace style is defined.
+
+*/
diff --git a/src/kspace.cpp b/src/kspace.cpp
index fc8b12288b..da606bbf3d 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -268,7 +268,7 @@ void KSpace::ev_setup(int eflag, int vflag, int alloc)
    called initially, when particle count changes, when charges are changed
 ------------------------------------------------------------------------- */
 
-void KSpace::qsum_qsq()
+void KSpace::qsum_qsq(int warning_flag)
 {
   const double * const q = atom->q;
   const int nlocal = atom->nlocal;
@@ -285,7 +285,7 @@ void KSpace::qsum_qsq()
   MPI_Allreduce(&qsum_local,&qsum,1,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&qsqsum_local,&qsqsum,1,MPI_DOUBLE,MPI_SUM,world);
 
-  if ((qsqsum == 0.0) && (comm->me == 0) && warn_nocharge) {
+  if ((qsqsum == 0.0) && (comm->me == 0) && warn_nocharge && warning_flag) {
     error->warning(FLERR,"Using kspace solver on system with no charge");
     warn_nocharge = 0;
   }
diff --git a/src/kspace.h b/src/kspace.h
index 28c7bcef2a..55ace5aa71 100644
--- a/src/kspace.h
+++ b/src/kspace.h
@@ -108,7 +108,7 @@ class KSpace : protected Pointers {
 
   // public so can be called by commands that change charge
 
-  void qsum_qsq();
+  void qsum_qsq(int warning_flag = 1);
 
   // general child-class methods