diff --git a/src/KSPACE/ewald.cpp b/src/KSPACE/ewald.cpp
index a80ba684dd..f98595992f 100644
--- a/src/KSPACE/ewald.cpp
+++ b/src/KSPACE/ewald.cpp
@@ -15,6 +15,7 @@
    Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
      per-atom energy/virial added by German Samolyuk (ORNL), Stan Moore (BYU)
      group/group energy/force added by Stan Moore (BYU)
+     triclinic added by Stan Moore (SNL)
 ------------------------------------------------------------------------- */
 
 #include "mpi.h"
@@ -86,8 +87,6 @@ void Ewald::init()
 
   // error check
 
-  if (domain->triclinic)
-    error->all(FLERR,"Cannot use Ewald with triclinic box");
   if (domain->dimension == 2)
     error->all(FLERR,"Cannot use Ewald with 2d simulation");
 
@@ -99,6 +98,8 @@ void Ewald::init()
     if (domain->xperiodic != 1 || domain->yperiodic != 1 ||
         domain->boundary[2][0] != 1 || domain->boundary[2][1] != 1)
       error->all(FLERR,"Incorrect boundaries with slab Ewald");
+    if (domain->triclinic)
+      error->all(FLERR,"Cannot (yet) use Ewald with triclinic box and slab correction");
   }
 
   // extract short-range Coulombic cutoff from pair style
@@ -143,6 +144,8 @@ void Ewald::init()
   q2 = qsqsum * force->qqrd2e / force->dielectric;
   bigint natoms = atom->natoms;
 
+  triclinic = domain->triclinic;
+
   // use xprd,yprd,zprd even if triclinic so grid size is the same
   // adjust z dimension for 2d slab Ewald
   // 3d Ewald just uses zprd since slab_volfactor = 1.0
@@ -171,9 +174,9 @@ void Ewald::init()
 
   // final RMS accuracy
 
-  double lprx = rms(kxmax,xprd,natoms,q2);
-  double lpry = rms(kymax,yprd,natoms,q2);
-  double lprz = rms(kzmax,zprd_slab,natoms,q2);
+  double lprx = rms(kxmax_orig,xprd,natoms,q2);
+  double lpry = rms(kymax_orig,yprd,natoms,q2);
+  double lprz = rms(kzmax_orig,zprd_slab,natoms,q2);
   double lpr = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
   double q2_over_sqrt = q2 / sqrt(natoms*cutoff*xprd*yprd*zprd_slab);
   double spr = 2.0 *q2_over_sqrt * exp(-g_ewald*g_ewald*cutoff*cutoff);
@@ -263,6 +266,28 @@ void Ewald::setup()
   double gsqzmx = unitk[2]*unitk[2]*kzmax*kzmax;
   gsqmx = MAX(gsqxmx,gsqymx);
   gsqmx = MAX(gsqmx,gsqzmx);
+
+  kxmax_orig = kxmax;
+  kymax_orig = kymax;
+  kzmax_orig = kzmax;
+
+  // scale lattice vectors for triclinic skew
+
+  if (triclinic) {
+    double tmp[3];
+    tmp[0] = kxmax/xprd;
+    tmp[1] = kymax/yprd;
+    tmp[2] = kzmax/zprd;
+    lamda2xT(&tmp[0],&tmp[0]);
+    kxmax = static_cast<int>(tmp[0]);
+    kymax = static_cast<int>(tmp[1]);
+    kzmax = static_cast<int>(tmp[2]);
+
+    kmax = MAX(kxmax,kymax);
+    kmax = MAX(kmax,kzmax);
+    kmax3d = 4*kmax*kmax*kmax + 6*kmax*kmax + 3*kmax;
+  }
+
   gsqmx *= 1.00001;
 
   // if size has grown, reallocate k-dependent and nlocal-dependent arrays
@@ -284,7 +309,10 @@ void Ewald::setup()
 
   // pre-compute Ewald coefficients
 
-  coeffs();
+  if (triclinic == 0)
+    coeffs();
+  else
+    coeffs_triclinic();
 }
 
 /* ----------------------------------------------------------------------
@@ -330,7 +358,11 @@ void Ewald::compute(int eflag, int vflag)
   // partial structure factors on each processor
   // total structure factor by summing over procs
 
-  eik_dot_r();
+  if (triclinic == 0)
+    eik_dot_r();
+  else
+    eik_dot_r_triclinic();
+
   MPI_Allreduce(sfacrl,sfacrl_all,kcount,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(sfacim,sfacim_all,kcount,MPI_DOUBLE,MPI_SUM,world);
 
@@ -368,7 +400,7 @@ void Ewald::compute(int eflag, int vflag)
 
       if (evflag_atom) {
         partial_peratom = exprl*sfacrl_all[k] + expim*sfacim_all[k];
-            if (eflag_atom) eatom[i] += q[i]*ug[k]*partial_peratom;
+        if (eflag_atom) eatom[i] += q[i]*ug[k]*partial_peratom;
         if (vflag_atom)
           for (j = 0; j < 6; j++)
             vatom[i][j] += ug[k]*vg[k][j]*partial_peratom;
@@ -611,6 +643,82 @@ void Ewald::eik_dot_r()
   }
 }
 
+/* ---------------------------------------------------------------------- */
+
+void Ewald::eik_dot_r_triclinic()
+{
+  int i,k,l,m,n,ic;
+  double cstr1,sstr1;
+  double sqk,clpm,slpm;
+
+  double **x = atom->x;
+  double *q = atom->q;
+  int nlocal = atom->nlocal;
+
+  double unitk_lamda[3];
+
+  double max_kvecs[3];
+  max_kvecs[0] = kxmax;
+  max_kvecs[1] = kymax;
+  max_kvecs[2] = kzmax;
+
+  // (k,0,0), (0,l,0), (0,0,m)
+
+  for (ic = 0; ic < 3; ic++) {
+    unitk_lamda[0] = 0.0;
+    unitk_lamda[1] = 0.0;
+    unitk_lamda[2] = 0.0;
+    unitk_lamda[ic] = 2.0*MY_PI;
+    x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
+    sqk = unitk_lamda[ic]*unitk_lamda[ic];
+    if (sqk <= gsqmx) {
+      for (i = 0; i < nlocal; i++) {
+        cs[0][ic][i] = 1.0;
+        sn[0][ic][i] = 0.0;
+        cs[1][ic][i] = cos(unitk_lamda[0]*x[i][0] + unitk_lamda[1]*x[i][1] + unitk_lamda[2]*x[i][2]);
+        sn[1][ic][i] = sin(unitk_lamda[0]*x[i][0] + unitk_lamda[1]*x[i][1] + unitk_lamda[2]*x[i][2]);
+        cs[-1][ic][i] = cs[1][ic][i];
+        sn[-1][ic][i] = -sn[1][ic][i];
+      }
+    }
+  }
+
+  for (ic = 0; ic < 3; ic++) {
+    for (m = 2; m <= max_kvecs[ic]; m++) {
+      unitk_lamda[0] = 0.0;
+      unitk_lamda[1] = 0.0;
+      unitk_lamda[2] = 0.0;
+      unitk_lamda[ic] = 2.0*MY_PI*m;
+      x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
+      sqk = unitk_lamda[ic]*unitk_lamda[ic];
+      for (i = 0; i < nlocal; i++) {
+        cs[m][ic][i] = cs[m-1][ic][i]*cs[1][ic][i] -
+          sn[m-1][ic][i]*sn[1][ic][i];
+        sn[m][ic][i] = sn[m-1][ic][i]*cs[1][ic][i] +
+          cs[m-1][ic][i]*sn[1][ic][i];
+        cs[-m][ic][i] = cs[m][ic][i];
+        sn[-m][ic][i] = -sn[m][ic][i];
+      }
+    }
+  }
+
+  for (n = 0; n < kcount; n++) {
+    k = kxvecs[n];
+    l = kyvecs[n];
+    m = kzvecs[n];
+    cstr1 = 0.0;
+    sstr1 = 0.0;
+    for (i = 0; i < nlocal; i++) {
+      clpm = cs[l][1][i]*cs[m][2][i] - sn[l][1][i]*sn[m][2][i];
+      slpm = sn[l][1][i]*cs[m][2][i] + cs[l][1][i]*sn[m][2][i];
+      cstr1 += q[i]*(cs[k][0][i]*clpm - sn[k][0][i]*slpm);
+      sstr1 += q[i]*(sn[k][0][i]*clpm + cs[k][0][i]*slpm);
+    }
+    sfacrl[n] = cstr1;
+    sfacim[n] = sstr1;
+  }
+}
+
 /* ----------------------------------------------------------------------
    pre-compute coefficients for each Ewald K-vector
 ------------------------------------------------------------------------- */
@@ -878,6 +986,112 @@ void Ewald::coeffs()
   }
 }
 
+/* ----------------------------------------------------------------------
+   pre-compute coefficients for each Ewald K-vector for a triclinic 
+   system
+------------------------------------------------------------------------- */
+
+void Ewald::coeffs_triclinic()
+{
+  int k,l,m;
+  double sqk,vterm;
+
+  double g_ewald_sq_inv = 1.0 / (g_ewald*g_ewald);
+  double preu = 4.0*MY_PI/volume;
+
+  double unitk_lamda[3];
+
+  kcount = 0;
+
+  // 1 = (k,l,m), 2 = (k,-l,m), 3 = (k,l,-m), 4 = (k,-l,-m)
+
+  for (k = 1; k <= kxmax; k++) {
+    for (l = -kymax; l <= kymax; l++) {
+      for (m = -kzmax; m <= kzmax; m++) {
+        unitk_lamda[0] = 2.0*MY_PI*k;
+        unitk_lamda[1] = 2.0*MY_PI*l;
+        unitk_lamda[2] = 2.0*MY_PI*m;
+        x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
+        sqk = unitk_lamda[0]*unitk_lamda[0] + unitk_lamda[1]*unitk_lamda[1] +
+          unitk_lamda[2]*unitk_lamda[2];
+        if (sqk <= gsqmx) {
+          kxvecs[kcount] = k;
+          kyvecs[kcount] = l;
+          kzvecs[kcount] = m;
+          ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
+          eg[kcount][0] = 2.0*unitk_lamda[0]*ug[kcount];
+          eg[kcount][1] = 2.0*unitk_lamda[1]*ug[kcount];
+          eg[kcount][2] = 2.0*unitk_lamda[2]*ug[kcount];
+          vterm = -2.0*(1.0/sqk + 0.25*g_ewald_sq_inv);
+          vg[kcount][0] = 1.0 + vterm*unitk_lamda[0]*unitk_lamda[0];
+          vg[kcount][1] = 1.0 + vterm*unitk_lamda[1]*unitk_lamda[1];
+          vg[kcount][2] = 1.0 + vterm*unitk_lamda[2]*unitk_lamda[2];
+          vg[kcount][3] = vterm*unitk_lamda[0]*unitk_lamda[1];
+          vg[kcount][4] = vterm*unitk_lamda[0]*unitk_lamda[2];
+          vg[kcount][5] = vterm*unitk_lamda[1]*unitk_lamda[2];
+          kcount++;
+        }
+      }
+    }
+  }
+
+  // 1 = (0,l,m), 2 = (0,l,-m)
+
+  for (l = 1; l <= kymax; l++) {
+    for (m = -kzmax; m <= kzmax; m++) {
+      unitk_lamda[0] = 0.0;
+      unitk_lamda[1] = 2.0*MY_PI*l;
+      unitk_lamda[2] = 2.0*MY_PI*m;
+      x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
+      sqk = unitk_lamda[1]*unitk_lamda[1] + unitk_lamda[2]*unitk_lamda[2];
+      if (sqk <= gsqmx) {
+        kxvecs[kcount] = 0;
+        kyvecs[kcount] = l;
+        kzvecs[kcount] = m;
+        ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
+        eg[kcount][0] =  0.0;
+        eg[kcount][1] =  2.0*unitk_lamda[1]*ug[kcount];
+        eg[kcount][2] =  2.0*unitk_lamda[2]*ug[kcount];
+        vterm = -2.0*(1.0/sqk + 0.25*g_ewald_sq_inv);
+        vg[kcount][0] = 1.0;
+        vg[kcount][1] = 1.0 + vterm*unitk_lamda[1]*unitk_lamda[1];
+        vg[kcount][2] = 1.0 + vterm*unitk_lamda[2]*unitk_lamda[2];
+        vg[kcount][3] = 0.0;
+        vg[kcount][4] = 0.0;
+        vg[kcount][5] = vterm*unitk_lamda[1]*unitk_lamda[2];
+        kcount++;
+      }
+    }
+  }
+
+  // (0,0,m)
+
+  for (m = 1; m <= kmax; m++) {
+    unitk_lamda[0] = 0.0;
+    unitk_lamda[1] = 0.0;
+    unitk_lamda[2] = 2.0*MY_PI*m;
+    x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
+    sqk = unitk_lamda[2]*unitk_lamda[2];
+    if (sqk <= gsqmx) {
+      kxvecs[kcount] = 0;
+      kyvecs[kcount] = 0;
+      kzvecs[kcount] = m;
+      ug[kcount] = preu*exp(-0.25*sqk*g_ewald_sq_inv)/sqk;
+      eg[kcount][0] = 0.0;
+      eg[kcount][1] = 0.0;
+      eg[kcount][2] = 2.0*unitk_lamda[2]*ug[kcount];
+      vterm = -2.0*(1.0/sqk + 0.25*g_ewald_sq_inv);
+      vg[kcount][0] = 1.0;
+      vg[kcount][1] = 1.0;
+      vg[kcount][2] = 1.0 + vterm*unitk_lamda[2]*unitk_lamda[2];
+      vg[kcount][3] = 0.0;
+      vg[kcount][4] = 0.0;
+      vg[kcount][5] = 0.0;
+      kcount++;
+    }
+  }
+}
+
 /* ----------------------------------------------------------------------
    allocate memory that depends on # of K-vectors
 ------------------------------------------------------------------------- */
diff --git a/src/KSPACE/ewald.h b/src/KSPACE/ewald.h
index e8efd7bc8c..c97690412d 100644
--- a/src/KSPACE/ewald.h
+++ b/src/KSPACE/ewald.h
@@ -43,6 +43,7 @@ class Ewald : public KSpace {
 
   double unitk[3];
   int *kxvecs,*kyvecs,*kzvecs;
+  int kxmax_orig,kymax_orig,kzmax_orig;
   double *ug;
   double **eg,**vg;
   double **ek;
@@ -62,6 +63,12 @@ class Ewald : public KSpace {
   void deallocate();
   void slabcorr();
 
+  // triclinic
+
+  int triclinic;
+  void eik_dot_r_triclinic();
+  void coeffs_triclinic();
+
   // group-group interactions
 
   void allocate_groups();
@@ -81,10 +88,6 @@ Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
-E: Cannot use Ewald with triclinic box
-
-This feature is not yet supported.
-
 E: Cannot use Ewald with 2d simulation
 
 The kspace style ewald cannot be used in 2d simulations.  You can use
@@ -105,6 +108,10 @@ E: Incorrect boundaries with slab Ewald
 Must have periodic x,y dimensions and non-periodic z dimension to use
 2d slab option with Ewald.
 
+E: Cannot (yet) use Ewald with triclinic box and slab correction
+
+This feature is not yet supported.
+
 E: KSpace style is incompatible with Pair style
 
 Setting a kspace style requires that a pair style with a long-range
diff --git a/src/KSPACE/msm.cpp b/src/KSPACE/msm.cpp
index cadd865b02..40bc87f042 100644
--- a/src/KSPACE/msm.cpp
+++ b/src/KSPACE/msm.cpp
@@ -126,9 +126,6 @@ void MSM::init()
 
   // error check
 
-  if (domain->triclinic)
-    error->all(FLERR,"Cannot (yet) use MSM with triclinic box");
-
   if (domain->dimension == 2)
     error->all(FLERR,"Cannot (yet) use MSM with 2d simulation");
 
@@ -186,6 +183,8 @@ void MSM::init()
     error->all(FLERR,str);
   }
 
+  triclinic = domain->triclinic;
+
   // set accuracy (force units) from accuracy_relative or accuracy_absolute
 
   if (accuracy_absolute >= 0.0) accuracy = accuracy_absolute;
@@ -343,9 +342,9 @@ double MSM::estimate_3d_error()
   double xprd = domain->xprd;
   double yprd = domain->yprd;
   double zprd = domain->zprd;
-  double error_x = estimate_1d_error(xprd/nx_msm[0],xprd);
-  double error_y = estimate_1d_error(yprd/ny_msm[0],yprd);
-  double error_z = estimate_1d_error(zprd/nz_msm[0],zprd);
+  double error_x = estimate_1d_error(h_x,xprd);
+  double error_y = estimate_1d_error(h_y,yprd);
+  double error_z = estimate_1d_error(h_z,zprd);
   double error_3d =
    sqrt(error_x*error_x + error_y*error_y + error_z*error_z) / sqrt(3.0);
   return error_3d;
@@ -394,21 +393,39 @@ void MSM::setup()
   // loop over grid levels and compute grid spacing
 
   for (int n=0; n<levels; n++) {
+    if (triclinic == 0) {
+      delxinv[n] = nx_msm[n]/xprd;
+      delyinv[n] = ny_msm[n]/yprd;
+      delzinv[n] = nz_msm[n]/zprd;
+    } else { // use lamda (0-1) coordinates
+      delxinv[n] = nx_msm[n];
+      delyinv[n] = ny_msm[n];
+      delzinv[n] = nz_msm[n];
+    }
+  }
 
-    delxinv[n] = nx_msm[n]/xprd;
-    delyinv[n] = ny_msm[n]/yprd;
-    delzinv[n] = nz_msm[n]/zprd;
+  double ax = a;
+  double ay = a;
+  double az = a;
 
-    delvolinv[n] = delxinv[n]*delyinv[n]*delzinv[n];
+  // transform the interaction sphere in box coords to an ellipsoid in lamda (0-1) coords to
+  // get the direct sum interaction limits for a triclinic system
+
+  if (triclinic) {
+    double tmp[3];
+    kspacebbox(a,&tmp[0]);
+    ax = tmp[0];
+    ay = tmp[1];
+    az = tmp[2];
   }
 
   // direct sum interaction limits
 
-  nxhi_direct = static_cast<int> (2.0*a*delxinv[0]);
+  nxhi_direct = static_cast<int> (2.0*ax*delxinv[0]);
   nxlo_direct = -nxhi_direct;
-  nyhi_direct = static_cast<int> (2.0*a*delyinv[0]);
+  nyhi_direct = static_cast<int> (2.0*ay*delyinv[0]);
   nylo_direct = -nyhi_direct;
-  nzhi_direct = static_cast<int> (2.0*a*delzinv[0]);
+  nzhi_direct = static_cast<int> (2.0*az*delzinv[0]);
   nzlo_direct = -nzhi_direct;
 
   nmax_direct = 8*(nxhi_direct+1)*(nyhi_direct+1)*(nzhi_direct+1);
@@ -434,7 +451,10 @@ void MSM::setup()
     }
   }
 
-  boxlo = domain->boxlo;
+  if (!triclinic)
+    boxlo = domain->boxlo;
+  else
+    boxlo = domain->boxlo_lamda;
 
   // ghost grid points depend on direct sum interaction limits, so need to recompute local grid
 
@@ -486,6 +506,11 @@ void MSM::compute(int eflag, int vflag)
     peratom_allocate_flag = 1;
   }
 
+  // convert atoms from box to lamda coords
+
+  if (triclinic)
+    domain->x2lamda(atom->nlocal);
+
   // extend size of per-atom arrays if necessary
 
   if (atom->nlocal > nmax) {
@@ -618,6 +643,10 @@ void MSM::compute(int eflag, int vflag)
     }
   }
 
+  // convert atoms back from lamda to box coords
+
+  if (triclinic)
+    domain->lamda2x(atom->nlocal);
 }
 
 /* ----------------------------------------------------------------------
@@ -810,7 +839,6 @@ void MSM::allocate_levels()
   delxinv = new double[levels];
   delyinv = new double[levels];
   delzinv = new double[levels];
-  delvolinv = new double[levels];
 
   qgrid = new double***[levels];
   egrid = new double***[levels];
@@ -882,7 +910,6 @@ void MSM::deallocate_levels()
   delete [] delxinv;
   delete [] delyinv;
   delete [] delzinv;
-  delete [] delvolinv;
 
   delete [] qgrid;
   delete [] egrid;
@@ -917,14 +944,16 @@ void MSM::set_grid_global()
     int p = order - 1;
     double hmin = 3072.0*(p+1)/(p-1)/
       (448.0*MY_PI + 56.0*MY_PI*order/2 + 1701.0);
-    hmin = pow(hmin,1.0/6.0)/pow(atom->natoms,1.0/3.0);
-    hx = hmin*xprd;
-    hy = hmin*yprd;
-    hz = hmin*zprd;
+    hmin = pow(hmin,1.0/6.0)*pow(xprd*yprd*zprd/atom->natoms,1.0/3.0);
+
+    nx_max = static_cast<int>(xprd/hmin);
+    ny_max = static_cast<int>(yprd/hmin);
+    nz_max = static_cast<int>(zprd/hmin);
+
+    nx_max = MAX(nx_max,2);
+    ny_max = MAX(ny_max,2);
+    nz_max = MAX(nz_max,2);
 
-    nx_max = static_cast<int>(xprd/hx);
-    ny_max = static_cast<int>(yprd/hy);
-    nz_max = static_cast<int>(zprd/hz);
   } else if (!gridflag) {
     // Coulombic cutoff is set by user, choose grid to give requested error
     nx_max = ny_max = nz_max = 2;
@@ -960,39 +989,6 @@ void MSM::set_grid_global()
     nz_max = nz_msm_max;
   }
 
-  // boost grid size until it is factorable by 2
-
-  int flag = 0;
-  int xlevels,ylevels,zlevels;
-
-  while (!factorable(nx_max,flag,xlevels)) nx_max++;
-  while (!factorable(ny_max,flag,ylevels)) ny_max++;
-  while (!factorable(nz_max,flag,zlevels)) nz_max++;
-
-  if (flag && gridflag && me == 0)
-    error->warning(FLERR,"Number of MSM mesh points increased to be a multiple of 2");
-
-  // find maximum number of levels
-
-  levels = MAX(xlevels,ylevels);
-  levels = MAX(levels,zlevels);
-
-  if (levels > MAX_LEVELS) error->all(FLERR,"Too many MSM grid levels");
-
-  // need at least 2 MSM levels for periodic systems
-
-  if (levels <= 1) {
-    levels = xlevels = ylevels = zlevels = 2;
-    nx_max = ny_max = nz_max = 2;
-    if (gridflag)
-      error->warning(FLERR,
-             "MSM mesh too small, increasing to 2 points in each direction");
-  }
-
-  // omit top grid level for periodic systems
-
-  if (!domain->nonperiodic) levels -= 1;
-
   // adjust Coulombic cutoff to give desired error (if requested)
 
   if (adjust_cutoff_flag) {
@@ -1015,6 +1011,67 @@ void MSM::set_grid_global()
     if (me == 0) error->warning(FLERR,str);
   }
 
+  // scale grid for triclinic skew
+
+  if (triclinic && !gridflag) {
+    double tmp[3];
+    tmp[0] = nx_max/xprd;
+    tmp[1] = ny_max/yprd;
+    tmp[2] = nz_max/zprd;
+    lamda2xT(&tmp[0],&tmp[0]);
+    nx_max = static_cast<int>(tmp[0]);
+    ny_max = static_cast<int>(tmp[1]);
+    nz_max = static_cast<int>(tmp[2]);
+  }
+
+  // boost grid size until it is factorable by 2
+
+  int flag = 0;
+  int xlevels,ylevels,zlevels;
+
+  while (!factorable(nx_max,flag,xlevels)) nx_max++;
+  while (!factorable(ny_max,flag,ylevels)) ny_max++;
+  while (!factorable(nz_max,flag,zlevels)) nz_max++;
+
+  if (flag && gridflag && me == 0)
+    error->warning(FLERR,"Number of MSM mesh points increased to be a multiple of 2");
+
+  if (triclinic == 0) {
+    h_x = xprd/nx_max;
+    h_y = yprd/ny_max;
+    h_z = zprd/nz_max;
+  } else {
+    double tmp[3];
+    tmp[0] = nx_max;
+    tmp[1] = ny_max;
+    tmp[2] = nz_max;
+    x2lamdaT(&tmp[0],&tmp[0]);
+    h_x = 1.0/tmp[0];
+    h_y = 1.0/tmp[1];
+    h_z = 1.0/tmp[2];
+  }
+
+  // find maximum number of levels
+
+  levels = MAX(xlevels,ylevels);
+  levels = MAX(levels,zlevels);
+
+  if (levels > MAX_LEVELS) error->all(FLERR,"Too many MSM grid levels");
+
+  // need at least 2 MSM levels for periodic systems
+
+  if (levels <= 1) {
+    levels = xlevels = ylevels = zlevels = 2;
+    nx_max = ny_max = nz_max = 2;
+    if (gridflag)
+      error->warning(FLERR,
+             "MSM mesh too small, increasing to 2 points in each direction");
+  }
+
+  // omit top grid level for periodic systems
+
+  if (!domain->nonperiodic) levels -= 1;
+
   allocate_levels();
 
   // find number of grid levels in each direction
@@ -1104,12 +1161,15 @@ void MSM::set_grid_local()
 
     double *prd,*sublo,*subhi,*boxhi;
 
-    prd = domain->prd;
-    boxlo = domain->boxlo;
-    boxhi = domain->boxhi;
-
-    sublo = domain->sublo;
-    subhi = domain->subhi;
+    if (!triclinic) {
+      prd = domain->prd;
+      sublo = domain->sublo;
+      subhi = domain->subhi;
+    } else {
+      prd = domain->prd_lamda;
+      sublo = domain->sublo_lamda;
+      subhi = domain->subhi_lamda;
+    }
 
     double xprd = prd[0];
     double yprd = prd[1];
@@ -1130,6 +1190,7 @@ void MSM::set_grid_local()
     double cuthalf = 0.0;
     if (n == 0) cuthalf = 0.5*neighbor->skin; // only applies to finest grid
     dist[0] = dist[1] = dist[2] = cuthalf;
+    if (triclinic) kspacebbox(cuthalf,&dist[0]);
 
     int nlo,nhi;
 
@@ -2698,6 +2759,19 @@ void MSM::fieldforce()
     eky *= delyinv[0];
     ekz *= delzinv[0];
 
+    // effectively divide by length for a triclinic system
+
+    if (triclinic) {
+      double tmp[3];
+      tmp[0] = ekx;
+      tmp[1] = eky;
+      tmp[2] = ekz;
+      x2lamdaT(&tmp[0],&tmp[0]);
+      ekx = tmp[0];
+      eky = tmp[1];
+      ekz = tmp[2];
+    }
+
     // convert E-field to force
 
     const double qfactor = force->qqrd2e*scale*q[i];
@@ -3036,6 +3110,8 @@ void MSM::get_g_direct()
 
   int n,zk,zyk,k,ix,iy,iz;
   double xdiff,ydiff,zdiff;
+  double dx,dy,dz;
+  double tmp[3];
   double rsq,rho,two_n;
 
   two_n = 1.0;
@@ -3053,7 +3129,25 @@ void MSM::get_g_direct()
         zyk = (zk + iy + nyhi_direct)*nx;
         for (ix = nxlo_direct; ix <= nxhi_direct; ix++) {
           xdiff = ix/delxinv[n];
-          rsq = xdiff*xdiff + ydiff*ydiff + zdiff*zdiff;
+
+          // transform grid point pair-wise distance from lamda (0-1) coords to box coords          
+
+          if (triclinic) {
+            tmp[0] = xdiff;
+            tmp[1] = ydiff;
+            tmp[2] = zdiff;
+            lamda2xvector(&tmp[0],&tmp[0]);
+            dx = tmp[0];
+            dy = tmp[1];
+            dz = tmp[2];
+          } else {
+            dx = xdiff;
+            dy = ydiff;
+            dz = zdiff;
+          }
+
+          rsq = dx*dx + dy*dy + dz*dz;
+
           rho = sqrt(rsq)/(two_n*a);
           k = zyk + ix + nxhi_direct;
           g_direct[n][k] = gamma(rho)/(two_n*a) - gamma(rho/2.0)/(2.0*two_n*a);
@@ -3087,6 +3181,8 @@ void MSM::get_virial_direct()
 
   int n,zk,zyk,k,ix,iy,iz;
   double xdiff,ydiff,zdiff;
+  double dx,dy,dz;
+  double tmp[3];
   double rsq,r,rho,two_n,two_nsq,dg;
 
   two_n = 1.0;
@@ -3105,7 +3201,22 @@ void MSM::get_virial_direct()
         zyk = (zk + iy + nyhi_direct)*nx;
         for (ix = nxlo_direct; ix <= nxhi_direct; ix++) {
           xdiff = ix/delxinv[n];
-          rsq = xdiff*xdiff + ydiff*ydiff + zdiff*zdiff;
+
+          if (triclinic) {
+            tmp[0] = xdiff;
+            tmp[1] = ydiff;
+            tmp[2] = zdiff;
+            lamda2xvector(&tmp[0],&tmp[0]);
+            dx = tmp[0];
+            dy = tmp[1];
+            dz = tmp[2];
+          } else {
+            dx = xdiff;
+            dy = ydiff;
+            dz = zdiff;
+          }
+
+          rsq = dx*dx + dy*dy + dz*dz;
           k = zyk + ix + nxhi_direct;
           r = sqrt(rsq);
           if (r == 0) {
@@ -3119,12 +3230,12 @@ void MSM::get_virial_direct()
             rho = r/(two_n*a);
             dg = -(dgamma(rho)/(two_nsq*a_sq) -
               dgamma(rho/2.0)/(4.0*two_nsq*a_sq))/r;
-            v0_direct[n][k] = dg * xdiff * xdiff;
-            v1_direct[n][k] = dg * ydiff * ydiff;
-            v2_direct[n][k] = dg * zdiff * zdiff;
-            v3_direct[n][k] = dg * xdiff * ydiff;
-            v4_direct[n][k] = dg * xdiff * zdiff;
-            v5_direct[n][k] = dg * ydiff * zdiff;
+            v0_direct[n][k] = dg * dx * dx;
+            v1_direct[n][k] = dg * dy * dy;
+            v2_direct[n][k] = dg * dz * dz;
+            v3_direct[n][k] = dg * dx * dy;
+            v4_direct[n][k] = dg * dx * dz;
+            v5_direct[n][k] = dg * dy * dz;
           }
 
         }
@@ -3157,6 +3268,8 @@ void MSM::get_g_direct_top(int n)
 
   int zk,zyk,k,ix,iy,iz;
   double xdiff,ydiff,zdiff;
+  double dx,dy,dz;
+  double tmp[3];
   double rsq,rho,two_n;
 
   two_n = pow(2.0,n);
@@ -3169,7 +3282,22 @@ void MSM::get_g_direct_top(int n)
       zyk = (zk + iy + ny_top)*nx;
       for (ix = -nx_top; ix <= nx_top; ix++) {
         xdiff = ix/delxinv[n];
-        rsq = xdiff*xdiff + ydiff*ydiff + zdiff*zdiff;
+
+        if (triclinic) {
+          tmp[0] = xdiff;
+          tmp[1] = ydiff;
+          tmp[2] = zdiff;
+          lamda2xvector(&tmp[0],&tmp[0]);
+          dx = tmp[0];
+          dy = tmp[1];
+          dz = tmp[2];
+        } else {
+          dx = xdiff;
+          dy = ydiff;
+          dz = zdiff;
+        }
+
+        rsq = dx*dx + dy*dy + dz*dz;
         rho = sqrt(rsq)/(two_n*a);
         k = zyk + ix + nx_top;
         g_direct_top[k] = gamma(rho)/(two_n*a);
@@ -3212,6 +3340,8 @@ void MSM::get_virial_direct_top(int n)
 
   int zk,zyk,k,ix,iy,iz;
   double xdiff,ydiff,zdiff;
+  double dx,dy,dz;
+  double tmp[3];
   double rsq,r,rho,two_n,two_nsq,dg;
 
   two_n = pow(2.0,n);
@@ -3225,7 +3355,21 @@ void MSM::get_virial_direct_top(int n)
       zyk = (zk + iy + ny_top)*nx;
       for (ix = -nx_top; ix <= nx_top; ix++) {
         xdiff = ix/delxinv[n];
-        rsq = xdiff*xdiff + ydiff*ydiff + zdiff*zdiff;
+        if (triclinic) {
+          tmp[0] = xdiff;
+          tmp[1] = ydiff;
+          tmp[2] = zdiff;
+          lamda2xvector(&tmp[0],&tmp[0]);
+          dx = tmp[0];
+          dy = tmp[1];
+          dz = tmp[2];
+        } else {
+          dx = xdiff;
+          dy = ydiff;
+          dz = zdiff;
+        }
+
+        rsq = dx*dx + dy*dy + dz*dz;
         k = zyk + ix + nx_top;
         r = sqrt(rsq);
         if (r == 0) {
@@ -3238,12 +3382,12 @@ void MSM::get_virial_direct_top(int n)
         } else {
           rho = r/(two_n*a);
           dg = -(dgamma(rho)/(two_nsq*a_sq))/r;
-          v0_direct_top[k] = dg * xdiff * xdiff;
-          v1_direct_top[k] = dg * ydiff * ydiff;
-          v2_direct_top[k] = dg * zdiff * zdiff;
-          v3_direct_top[k] = dg * xdiff * ydiff;
-          v4_direct_top[k] = dg * xdiff * zdiff;
-          v5_direct_top[k] = dg * ydiff * zdiff;
+          v0_direct_top[k] = dg * dx * dx;
+          v1_direct_top[k] = dg * dy * dy;
+          v2_direct_top[k] = dg * dz * dz;
+          v3_direct_top[k] = dg * dx * dy;
+          v4_direct_top[k] = dg * dx * dz;
+          v5_direct_top[k] = dg * dy * dz;
         }
       }
     }
diff --git a/src/KSPACE/msm.h b/src/KSPACE/msm.h
index 781f91c4bd..9a91a6a5e6 100644
--- a/src/KSPACE/msm.h
+++ b/src/KSPACE/msm.h
@@ -44,7 +44,8 @@ class MSM : public KSpace {
   double qqrd2e;
   double cutoff;
   double volume;
-  double *delxinv,*delyinv,*delzinv,*delvolinv;
+  double *delxinv,*delyinv,*delzinv;
+  double h_x,h_y,h_z;
 
   int *nx_msm,*ny_msm,*nz_msm;
   int *nxlo_in,*nylo_in,*nzlo_in;
@@ -128,6 +129,10 @@ class MSM : public KSpace {
   void get_g_direct_top(int);
   void get_virial_direct_top(int);
 
+  // triclinic
+
+  int triclinic;
+
   // grid communication
   void pack_forward(int, double *, int, int *);
   void unpack_forward(int, double *, int, int *);
@@ -148,10 +153,6 @@ Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
-E: Cannot (yet) use MSM with triclinic box
-
-This feature is not yet supported.
-
 E: Cannot (yet) use MSM with 2d simulation
 
 This feature is not yet supported.
diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp
index 6778efbb35..53f373ac12 100644
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@@ -15,6 +15,7 @@
    Contributing authors: Roy Pollock (LLNL), Paul Crozier (SNL)
      per-atom energy/virial & group/group energy/force added by Stan Moore (BYU)
      analytic diff (2 FFT) option added by Rolf Isele-Holder (Aachen University)
+     triclinic added by Stan Moore (SNL)
 ------------------------------------------------------------------------- */
 
 #include "lmptype.h"
@@ -170,8 +171,10 @@ void PPPM::init()
 
   // error check
 
-  if (domain->triclinic)
-    error->all(FLERR,"Cannot (yet) use PPPM with triclinic box");
+  if (domain->triclinic && differentiation_flag == 1)
+    error->all(FLERR,"Cannot (yet) use PPPM with triclinic box and 'kspace_modify diff ad'");
+  if (domain->triclinic && slabflag)
+    error->all(FLERR,"Cannot (yet) use PPPM with triclinic box and slab correction");
   if (domain->dimension == 2) error->all(FLERR,
                                          "Cannot use PPPM with 2d simulation");
 
@@ -234,6 +237,8 @@ void PPPM::init()
     double theta = force->angle->equilibrium_angle(typeA);
     double blen = force->bond->equilibrium_distance(typeB);
     alpha = qdist / (cos(0.5*theta) * blen);
+    if (domain->triclinic)
+      error->all(FLERR,"Cannot (yet) use PPPM with triclinic box and TIP4P");
   }
 
   // compute qsum & qsqsum and warn if not charge-neutral
@@ -380,6 +385,11 @@ void PPPM::init()
 
 void PPPM::setup()
 {
+  if (triclinic) {
+    setup_triclinic();
+    return;
+  }
+
   int i,j,k,n;
   double *prd;
 
@@ -459,6 +469,87 @@ void PPPM::setup()
   else compute_gf_ik();
 }
 
+/* ----------------------------------------------------------------------
+   adjust PPPM coeffs, called initially and whenever volume has changed
+   for a triclinic system
+------------------------------------------------------------------------- */
+
+void PPPM::setup_triclinic()
+{
+  int i,j,k,n;
+  double *prd;
+
+  // volume-dependent factors
+  // adjust z dimension for 2d slab PPPM
+  // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
+
+  prd = domain->prd;
+
+  double xprd = prd[0];
+  double yprd = prd[1];
+  double zprd = prd[2];
+  double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  // use lamda (0-1) coordinates
+
+  delxinv = nx_pppm;
+  delyinv = ny_pppm;
+  delzinv = nz_pppm;
+  delvolinv = delxinv*delyinv*delzinv/volume;
+
+  // fkx,fky,fkz for my FFT grid pts
+
+  double per_i,per_j,per_k;
+
+  n = 0;
+  for (k = nzlo_fft; k <= nzhi_fft; k++) {
+    per_k = k - nz_pppm*(2*k/nz_pppm);
+    for (j = nylo_fft; j <= nyhi_fft; j++) {
+      per_j = j - ny_pppm*(2*j/ny_pppm);
+      for (i = nxlo_fft; i <= nxhi_fft; i++) {
+        per_i = i - nx_pppm*(2*i/nx_pppm);
+
+        double unitk_lamda[3];
+        unitk_lamda[0] = 2.0*MY_PI*per_i;
+        unitk_lamda[1] = 2.0*MY_PI*per_j;
+        unitk_lamda[2] = 2.0*MY_PI*per_k;
+        x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
+        fkx[n] = unitk_lamda[0];
+        fky[n] = unitk_lamda[1];
+        fkz[n] = unitk_lamda[2];
+        n++;
+      }
+    }
+  }
+
+  // virial coefficients
+
+  double sqk,vterm;
+
+  for (n = 0; n < nfft; n++) {
+    sqk = fkx[n]*fkx[n] + fky[n]*fky[n] + fkz[n]*fkz[n];
+    if (sqk == 0.0) {
+      vg[n][0] = 0.0;
+      vg[n][1] = 0.0;
+      vg[n][2] = 0.0;
+      vg[n][3] = 0.0;
+      vg[n][4] = 0.0;
+      vg[n][5] = 0.0;
+    } else {
+      vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
+      vg[n][0] = 1.0 + vterm*fkx[n]*fkx[n];
+      vg[n][1] = 1.0 + vterm*fky[n]*fky[n];
+      vg[n][2] = 1.0 + vterm*fkz[n]*fkz[n];
+      vg[n][3] = vterm*fkx[n]*fky[n];
+      vg[n][4] = vterm*fkx[n]*fkz[n];
+      vg[n][5] = vterm*fky[n]*fkz[n];
+    }
+  }
+
+  compute_gf_ik_triclinic();
+}
+
 /* ----------------------------------------------------------------------
    reset local grid arrays and communication stencils
    called by fix balance b/c it changed sizes of processor sub-domains
@@ -652,9 +743,15 @@ void PPPM::allocate()
   memory->create(work2,2*nfft_both,"pppm:work2");
   memory->create(vg,nfft_both,6,"pppm:vg");
 
-  memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx");
-  memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky");
-  memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz");
+  if (triclinic == 0) {
+    memory->create1d_offset(fkx,nxlo_fft,nxhi_fft,"pppm:fkx");
+    memory->create1d_offset(fky,nylo_fft,nyhi_fft,"pppm:fky");
+    memory->create1d_offset(fkz,nzlo_fft,nzhi_fft,"pppm:fkz");
+  } else {
+    memory->create(fkx,nfft_both,"pppm:fkx");
+    memory->create(fky,nfft_both,"pppm:fky");
+    memory->create(fkz,nfft_both,"pppm:fkz");
+  }
 
   if (differentiation_flag == 1) {
     memory->create3d_offset(u_brick,nzlo_out,nzhi_out,nylo_out,nyhi_out,
@@ -796,9 +893,15 @@ void PPPM::deallocate()
   memory->destroy(work2);
   memory->destroy(vg);
 
-  memory->destroy1d_offset(fkx,nxlo_fft);
-  memory->destroy1d_offset(fky,nylo_fft);
-  memory->destroy1d_offset(fkz,nzlo_fft);
+  if (triclinic == 0) {
+    memory->destroy1d_offset(fkx,nxlo_fft);
+    memory->destroy1d_offset(fky,nylo_fft);
+    memory->destroy1d_offset(fkz,nzlo_fft);
+  } else {
+    memory->destroy(fkx);
+    memory->destroy(fky);
+    memory->destroy(fkz);
+  }
 
   memory->destroy(gf_b);
   memory->destroy2d_offset(rho1d,-order/2);
@@ -838,7 +941,7 @@ void PPPM::deallocate_peratom()
 
 void PPPM::set_grid_global()
 {
-  // use xprd,yprd,zprd even if triclinic so grid size is the same
+  // use xprd,yprd,zprd (even if triclinic, and then scale later)
   // adjust z dimension for 2d slab PPPM
   // 3d PPPM just uses zprd since slab_volfactor = 1.0
 
@@ -852,7 +955,7 @@ void PPPM::set_grid_global()
   // fluid-occupied volume used to estimate real-space error
   // zprd used rather than zprd_slab
 
-  double h,h_x,h_y,h_z;
+  double h;
   bigint natoms = atom->natoms;
 
   if (!gewaldflag) {
@@ -944,6 +1047,19 @@ void PPPM::set_grid_global()
         h_z = zprd_slab/nz_pppm;
       }
     }
+
+    // scale grid for triclinic skew
+    
+    if (triclinic) {
+      double tmp[3];
+      tmp[0] = nx_pppm/xprd;
+      tmp[1] = ny_pppm/yprd;
+      tmp[2] = nz_pppm/zprd;
+      lamda2xT(&tmp[0],&tmp[0]);
+      nx_pppm = static_cast<int>(tmp[0]) + 1;
+      ny_pppm = static_cast<int>(tmp[1]) + 1;
+      nz_pppm = static_cast<int>(tmp[2]) + 1;
+    }
   }
 
   // boost grid size until it is factorable
@@ -952,6 +1068,21 @@ void PPPM::set_grid_global()
   while (!factorable(ny_pppm)) ny_pppm++;
   while (!factorable(nz_pppm)) nz_pppm++;
 
+  if (triclinic == 0) {
+    h_x = xprd/nx_pppm;
+    h_y = yprd/ny_pppm;
+    h_z = zprd_slab/nz_pppm;
+  } else {
+    double tmp[3];
+    tmp[0] = nx_pppm;
+    tmp[1] = ny_pppm;
+    tmp[2] = nz_pppm;
+    x2lamdaT(&tmp[0],&tmp[0]);
+    h_x = 1.0/tmp[0];
+    h_y = 1.0/tmp[1];
+    h_z = 1.0/tmp[2];
+  }
+
   if (nx_pppm >= OFFSET || ny_pppm >= OFFSET || nz_pppm >= OFFSET)
     error->all(FLERR,"PPPM grid is too large");
 }
@@ -994,9 +1125,9 @@ double PPPM::compute_df_kspace()
     double qopt = compute_qopt();
     df_kspace = sqrt(qopt/natoms)*q2/(xprd*yprd*zprd_slab);
   } else {
-    double lprx = estimate_ik_error(xprd/nx_pppm,xprd,natoms);
-    double lpry = estimate_ik_error(yprd/ny_pppm,yprd,natoms);
-    double lprz = estimate_ik_error(zprd_slab/nz_pppm,zprd_slab,natoms);
+    double lprx = estimate_ik_error(h_x,xprd,natoms);
+    double lpry = estimate_ik_error(h_y,yprd,natoms);
+    double lprz = estimate_ik_error(h_z,zprd_slab,natoms);
     df_kspace = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
   }
   return df_kspace;
@@ -1009,7 +1140,7 @@ double PPPM::compute_df_kspace()
 double PPPM::compute_qopt()
 {
   double qopt = 0.0;
-  double *prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
+  double *prd = domain->prd;
   
   const double xprd = prd[0];
   const double yprd = prd[1];
@@ -1254,11 +1385,7 @@ void PPPM::set_grid_local()
   double dist[3];
   double cuthalf = 0.5*neighbor->skin + qdist;
   if (triclinic == 0) dist[0] = dist[1] = dist[2] = cuthalf;
-  else {
-    dist[0] = cuthalf/domain->prd[0];
-    dist[1] = cuthalf/domain->prd[1];
-    dist[2] = cuthalf/domain->prd[2];
-  }
+  else kspacebbox(cuthalf,&dist[0]);
 
   int nlo,nhi;
 
@@ -1369,7 +1496,7 @@ void PPPM::compute_gf_denom()
 
 void PPPM::compute_gf_ik()
 {
-  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
+  const double * const prd = domain->prd;
 
   const double xprd = prd[0];
   const double yprd = prd[1];
@@ -1446,14 +1573,105 @@ void PPPM::compute_gf_ik()
   }
 }
 
+/* ----------------------------------------------------------------------
+   pre-compute modified (Hockney-Eastwood) Coulomb Green's function
+   for a triclinic system
+------------------------------------------------------------------------- */
+
+void PPPM::compute_gf_ik_triclinic()
+{
+  double snx,sny,snz;
+  double argx,argy,argz,wx,wy,wz,sx,sy,sz,qx,qy,qz;
+  double sum1,dot1,dot2;
+  double numerator,denominator;
+  double sqk;
+
+  int k,l,m,n,nx,ny,nz,kper,lper,mper;
+
+  double tmp[3];
+  tmp[0] = (g_ewald/(MY_PI*nx_pppm)) * pow(-log(EPS_HOC),0.25);
+  tmp[1] = (g_ewald/(MY_PI*ny_pppm)) * pow(-log(EPS_HOC),0.25);
+  tmp[2] = (g_ewald/(MY_PI*nz_pppm)) * pow(-log(EPS_HOC),0.25);
+  lamda2xT(&tmp[0],&tmp[0]);
+  const int nbx = static_cast<int> (tmp[0]);
+  const int nby = static_cast<int> (tmp[1]);
+  const int nbz = static_cast<int> (tmp[2]);
+
+  const int twoorder = 2*order;
+
+  n = 0;
+  for (m = nzlo_fft; m <= nzhi_fft; m++) {
+    mper = m - nz_pppm*(2*m/nz_pppm);
+    snz = square(sin(MY_PI*mper/nz_pppm));
+
+    for (l = nylo_fft; l <= nyhi_fft; l++) {
+      lper = l - ny_pppm*(2*l/ny_pppm);
+      sny = square(sin(MY_PI*lper/ny_pppm));
+
+      for (k = nxlo_fft; k <= nxhi_fft; k++) {
+        kper = k - nx_pppm*(2*k/nx_pppm);
+        snx = square(sin(MY_PI*kper/nx_pppm));
+
+        double unitk_lamda[3];
+        unitk_lamda[0] = 2.0*MY_PI*kper;
+        unitk_lamda[1] = 2.0*MY_PI*lper;
+        unitk_lamda[2] = 2.0*MY_PI*mper;
+        x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
+
+        sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]);
+
+        if (sqk != 0.0) {
+          numerator = 12.5663706/sqk;
+          denominator = gf_denom(snx,sny,snz);
+          sum1 = 0.0;
+
+          for (nx = -nbx; nx <= nbx; nx++) {
+            argx = MY_PI*kper/nx_pppm + MY_PI*nx;
+            wx = powsinxx(argx,twoorder);
+
+            for (ny = -nby; ny <= nby; ny++) {
+              argy = MY_PI*lper/ny_pppm + MY_PI*ny;
+              wy = powsinxx(argy,twoorder);
+
+              for (nz = -nbz; nz <= nbz; nz++) {
+                argz = MY_PI*mper/nz_pppm + MY_PI*nz;
+                wz = powsinxx(argz,twoorder);
+
+                double b[3];
+                b[0] = 2.0*MY_PI*nx_pppm*nx;
+                b[1] = 2.0*MY_PI*ny_pppm*ny;
+                b[2] = 2.0*MY_PI*nz_pppm*nz;
+                x2lamdaT(&b[0],&b[0]);
+
+                qx = unitk_lamda[0]+b[0];
+                sx = exp(-0.25*square(qx/g_ewald));
+
+                qy = unitk_lamda[1]+b[1];
+                sy = exp(-0.25*square(qy/g_ewald));
+
+                qz = unitk_lamda[2]+b[2];
+                sz = exp(-0.25*square(qz/g_ewald));
+
+                dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz;
+                dot2 = qx*qx+qy*qy+qz*qz;
+                sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
+              }
+            }
+          }
+          greensfn[n++] = numerator*sum1/denominator;
+        } else greensfn[n++] = 0.0;
+      }
+    }
+  }
+}
+
 /* ----------------------------------------------------------------------
    compute optimized Green's function for energy calculation
 ------------------------------------------------------------------------- */
 
 void PPPM::compute_gf_ad()
 {
-
-  const double * const prd = (triclinic==0) ? domain->prd : domain->prd_lamda;
+  const double * const prd = domain->prd;
 
   const double xprd = prd[0];
   const double yprd = prd[1];
@@ -1807,6 +2025,13 @@ void PPPM::poisson_ik()
 
   if (evflag_atom) poisson_peratom();
 
+  // triclinic system
+
+  if (triclinic) {
+    poisson_ik_triclinic();
+    return;
+  }
+
   // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k)
   // FFT leaves data in 3d brick decomposition
   // copy it into inner portion of vdx,vdy,vdz arrays
@@ -1875,6 +2100,76 @@ void PPPM::poisson_ik()
       }
 }
 
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for ik for a triclinic system
+------------------------------------------------------------------------- */
+
+void PPPM::poisson_ik_triclinic()
+{
+  int i,j,k,n;
+
+  // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k)
+  // FFT leaves data in 3d brick decomposition
+  // copy it into inner portion of vdx,vdy,vdz arrays
+
+  // x direction gradient
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work2[n] = fkx[i]*work1[n+1];
+    work2[n+1] = -fkx[i]*work1[n];
+    n += 2;
+  }
+
+  fft2->compute(work2,work2,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdx_brick[k][j][i] = work2[n];
+        n += 2;
+      }
+
+  // y direction gradient
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work2[n] = fky[i]*work1[n+1];
+    work2[n+1] = -fky[i]*work1[n];
+    n += 2;
+  }
+
+  fft2->compute(work2,work2,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdy_brick[k][j][i] = work2[n];
+        n += 2;
+      }
+
+  // z direction gradient
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    work2[n] = fkz[i]*work1[n+1];
+    work2[n+1] = -fkz[i]*work1[n];
+    n += 2;
+  }
+
+  fft2->compute(work2,work2,-1);
+
+  n = 0;
+  for (k = nzlo_in; k <= nzhi_in; k++)
+    for (j = nylo_in; j <= nyhi_in; j++)
+      for (i = nxlo_in; i <= nxhi_in; i++) {
+        vdz_brick[k][j][i] = work2[n];
+        n += 2;
+      }
+}
+
 /* ----------------------------------------------------------------------
    FFT-based Poisson solver for ad
 ------------------------------------------------------------------------- */
@@ -2166,9 +2461,7 @@ void PPPM::fieldforce_ad()
   double sf = 0.0;
   double *prd;
 
-  if (triclinic == 0) prd = domain->prd;
-  else prd = domain->prd_lamda;
-
+  prd = domain->prd;
   double xprd = prd[0];
   double yprd = prd[1];
   double zprd = prd[2];
@@ -2717,6 +3010,7 @@ double PPPM::memory_usage()
   } else {
     bytes += 4 * nbrick * sizeof(FFT_SCALAR);
   }
+  if (triclinic) bytes += 3 * nfft_both * sizeof(double);
   bytes += 6 * nfft_both * sizeof(double);
   bytes += nfft_both * sizeof(double);
   bytes += nfft_both*5 * sizeof(FFT_SCALAR);
@@ -2748,11 +3042,23 @@ void PPPM::compute_group_group(int groupbit_A, int groupbit_B, int BA_flag)
     error->all(FLERR,"Cannot (yet) use K-space slab "
                "correction with compute group/group");
 
+  if (differentiation_flag)
+    error->all(FLERR,"Cannot (yet) use 'kspace_modify "
+               "diff ad' with compute group/group");
+
   if (!group_allocate_flag) {
     allocate_groups();
     group_allocate_flag = 1;
   }
 
+  // convert atoms from box to lamda coords
+
+  if (triclinic == 0) boxlo = domain->boxlo;
+  else {
+    boxlo = domain->boxlo_lamda;
+    domain->x2lamda(atom->nlocal);
+  }
+
   e2group = 0; //energy
   f2group[0] = 0; //force in x-direction
   f2group[1] = 0; //force in y-direction
@@ -2816,6 +3122,10 @@ void PPPM::compute_group_group(int groupbit_A, int groupbit_B, int BA_flag)
   MPI_Allreduce(f2group,f2group_all,3,MPI_DOUBLE,MPI_SUM,world);
 
   for (int i = 0; i < 3; i++) f2group[i] = qscale*volume*f2group_all[i];
+
+  // convert atoms back from lamda to box coords
+
+  if (triclinic) domain->lamda2x(atom->nlocal);
 }
 
 /* ----------------------------------------------------------------------
@@ -2979,6 +3289,13 @@ void PPPM::poisson_groups(int BA_flag)
     work_A[n++] *= s2 * greensfn[i];
   }
 
+  // triclinic system
+  
+  if (triclinic) {
+    poisson_groups_triclinic();
+    return;
+  }
+
   double partial_group;
 
   // force, x direction
@@ -3014,3 +3331,47 @@ void PPPM::poisson_groups(int BA_flag)
         n += 2;
       }
 }
+
+/* ----------------------------------------------------------------------
+   FFT-based Poisson solver for group-group interactions
+   for a triclinic system
+ ------------------------------------------------------------------------- */
+
+void PPPM::poisson_groups_triclinic()
+{
+  int i,j,k,n;
+
+  // reuse memory (already declared)
+
+  FFT_SCALAR *work_A = work1;
+  FFT_SCALAR *work_B = work2;
+
+  double partial_group;
+
+  // force, x direction
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1];
+    f2group[0] += fkx[i] * partial_group;
+    n += 2;
+  }
+
+  // force, y direction
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1];
+    f2group[1] += fky[i] * partial_group;
+    n += 2;
+  }
+
+  // force, z direction
+
+  n = 0;
+  for (i = 0; i < nfft; i++) {
+    partial_group = work_A[n+1]*work_B[n] - work_A[n]*work_B[n+1];
+    f2group[2] += fkz[i] * partial_group;
+    n += 2;
+  }
+}
diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h
index 33783a13b1..7da6d80560 100644
--- a/src/KSPACE/pppm.h
+++ b/src/KSPACE/pppm.h
@@ -57,6 +57,7 @@ class PPPM : public KSpace {
   double cutoff;
   double volume;
   double delxinv,delyinv,delzinv,delvolinv;
+  double h_x,h_y,h_z;
   double shift,shiftone;
   int peratom_allocate_flag;
 
@@ -99,7 +100,6 @@ class PPPM : public KSpace {
   int **part2grid;             // storage for particle -> grid mapping
   int nmax;
 
-  int triclinic;               // domain settings, orthog or triclinic
   double *boxlo;
                                // TIP4P settings
   int typeH,typeO;             // atom types of TIP4P water H and O atoms
@@ -155,6 +155,14 @@ class PPPM : public KSpace {
   virtual void pack_reverse(int, FFT_SCALAR *, int, int *);
   virtual void unpack_reverse(int, FFT_SCALAR *, int, int *);
 
+  // triclinic
+
+  int triclinic;               // domain settings, orthog or triclinic
+  void setup_triclinic();
+  void compute_gf_ik_triclinic();
+  void poisson_ik_triclinic();
+  void poisson_groups_triclinic();
+
   // group-group interactions
 
   virtual void allocate_groups();
@@ -201,7 +209,11 @@ Self-explanatory.  Check the input script syntax and compare to the
 documentation for the command.  You can use -echo screen as a
 command-line option when running LAMMPS to see the offending line.
 
-E: Cannot (yet) use PPPM with triclinic box
+E: Cannot (yet) use PPPM with triclinic box and 'kspace_modify diff ad'
+
+This feature is not yet supported.
+
+E: Cannot (yet) use PPPM with triclinic box and slab correction
 
 This feature is not yet supported.
 
@@ -247,6 +259,10 @@ E: Bad TIP4P bond type for PPPM/TIP4P
 
 Specified bond type is not valid.
 
+E: Cannot (yet) use PPPM with triclinic box and TIP4P
+
+This feature is not yet supported.
+
 E: Cannot use kspace solver on system with no charge
 
 No atoms in system have a non-zero charge.
@@ -312,4 +328,8 @@ E: Cannot (yet) use K-space slab correction with compute group/group
 
 This option is not yet supported.
 
+E: Cannot (yet) use 'kspace_modify diff ad' with compute group/group
+
+This option is not yet supported.
+
 */
diff --git a/src/REPLICA/prd.cpp b/src/REPLICA/prd.cpp
index fce3b3adab..e68f53b309 100644
--- a/src/REPLICA/prd.cpp
+++ b/src/REPLICA/prd.cpp
@@ -563,37 +563,44 @@ int PRD::check_event(int replica_num)
   if (replica_num >= 0 && replica_num != universe->iworld) worldflag = 0;
 
   timer->barrier_start(TIME_LOOP);
+
   if (me == 0) MPI_Allreduce(&worldflag,&universeflag,1,
                              MPI_INT,MPI_SUM,comm_replica);
   MPI_Bcast(&universeflag,1,MPI_INT,0,world);
+
   ncoincident = universeflag;
+
   if (!universeflag) ireplica = -1;
   else {
 
     // multiple events, choose one at random
     // iwhich = random # from 1 to N, N = # of events to choose from
     // scanflag = 1 to N on replicas with an event, 0 on non-event replicas
-    // worldflag = 1 on chosen replica, 0 on all others
+    // exit with worldflag = 1 on chosen replica, 0 on all others
+    // note worldflag is already 0 on replicas that didn't perform event
 
     if (universeflag > 1) {
       int iwhich = static_cast<int>
         (universeflag*random_select->uniform()) + 1;
-      if (me == 0) {
+
+      if (me == 0)
         MPI_Scan(&worldflag,&scanflag,1,MPI_INT,MPI_SUM,comm_replica);
-        scanflag *= worldflag;
-      }
       MPI_Bcast(&scanflag,1,MPI_INT,0,world);
+
       if (scanflag != iwhich) worldflag = 0;
     }
 
     if (worldflag) replicaflag = universe->iworld;
     else replicaflag = 0;
+
     if (me == 0) MPI_Allreduce(&replicaflag,&ireplica,1,
                                MPI_INT,MPI_SUM,comm_replica);
     MPI_Bcast(&ireplica,1,MPI_INT,0,world);
   }
+
   timer->barrier_stop(TIME_LOOP);
   time_comm += timer->array[TIME_LOOP];
+
   return ireplica;
 }
 
diff --git a/src/comm.cpp b/src/comm.cpp
index 8a7e13eb8c..a0c14fa69a 100644
--- a/src/comm.cpp
+++ b/src/comm.cpp
@@ -534,6 +534,7 @@ void Comm::setup()
     maxneed[0] = MAX(all[0],all[1]);
     maxneed[1] = MAX(all[2],all[3]);
     maxneed[2] = MAX(all[4],all[5]);
+    if (me == 0) printf("MAXNEED %d %d %d\n",maxneed[0],maxneed[1],maxneed[2]);
   }
 
   // allocate comm memory
diff --git a/src/kspace.cpp b/src/kspace.cpp
index 66c152e34f..a4aeb07576 100644
--- a/src/kspace.cpp
+++ b/src/kspace.cpp
@@ -22,6 +22,7 @@
 #include "atom_masks.h"
 #include "error.h"
 #include "suffix.h"
+#include "domain.h"
 
 using namespace LAMMPS_NS;
 
@@ -242,6 +243,86 @@ double KSpace::estimate_table_accuracy(double q2_over_sqrt, double spr)
   return table_accuracy;
 }
 
+/* ----------------------------------------------------------------------
+   convert box coords vector to transposed triclinic lamda (0-1) coords
+   vector, lamda = [(H^-1)^T] * v, does not preserve vector magnitude
+   v and lamda can point to same 3-vector
+------------------------------------------------------------------------- */
+
+void KSpace::x2lamdaT(double *v, double *lamda)
+{
+  double *h_inv = domain->h_inv;
+  double lamda_tmp[3];
+
+  lamda_tmp[0] = h_inv[0]*v[0];
+  lamda_tmp[1] = h_inv[5]*v[0] + h_inv[1]*v[1];
+  lamda_tmp[2] = h_inv[4]*v[0] + h_inv[3]*v[1] + h_inv[2]*v[2];
+
+  lamda[0] = lamda_tmp[0];
+  lamda[1] = lamda_tmp[1];
+  lamda[2] = lamda_tmp[2];
+}
+
+/* ----------------------------------------------------------------------
+   convert lamda (0-1) coords vector to transposed box coords vector
+   lamda = (H^T) * v, does not preserve vector magnitude
+   v and lamda can point to same 3-vector
+------------------------------------------------------------------------- */
+
+void KSpace::lamda2xT(double *lamda, double *v)
+{
+  double *h = domain->h;
+  double v_tmp[3];
+
+  v_tmp[0] = h[0]*lamda[0];
+  v_tmp[1] = h[5]*lamda[0] + h[1]*lamda[1];
+  v_tmp[2] = h[4]*lamda[0] + h[3]*lamda[1] + h[2]*lamda[2];
+
+  v[0] = v_tmp[0];
+  v[1] = v_tmp[1];
+  v[2] = v_tmp[2];
+}
+
+/* ----------------------------------------------------------------------
+   convert triclinic lamda (0-1) coords vector to box coords vector
+   v = H * lamda, does not preserve vector magnitude
+   lamda and v can point to same 3-vector
+------------------------------------------------------------------------- */
+
+void KSpace::lamda2xvector(double *lamda, double *v)
+{
+  double *h = domain->h;
+
+  v[0] = h[0]*lamda[0] + h[5]*lamda[1] + h[4]*lamda[2];
+  v[1] = h[1]*lamda[1] + h[3]*lamda[2];
+  v[2] = h[2]*lamda[2];
+}
+
+/* ----------------------------------------------------------------------
+   convert a sphere in box coords to an ellipsoid in lamda (0-1)
+   coords and return the tight (axis-aligned) bounding box, does not 
+   preserve vector magnitude
+   see http://www.loria.fr/~shornus/ellipsoid-bbox.html and
+   http://yiningkarlli.blogspot.com/2013/02/bounding-boxes-for-ellipsoidsfigure.html
+------------------------------------------------------------------------- */
+
+void KSpace::kspacebbox(double r, double *b)
+{
+  double *h = domain->h;
+  double lx,ly,lz,xy,xz,yz;
+  lx = h[0];
+  ly = h[1];
+  lz = h[2];
+  yz = h[3];
+  xz = h[4];
+  xy = h[5];
+
+  b[0] = r*sqrt(ly*ly*lz*lz + ly*ly*xz*xz - 2.0*ly*xy*xz*yz + lz*lz*xy*xy + 
+         xy*xy*yz*yz)/(lx*ly*lz);
+  b[1] = r*sqrt(lz*lz + yz*yz)/(ly*lz);
+  b[2] = r/lz;
+}
+
 /* ----------------------------------------------------------------------
    modify parameters of the KSpace style
 ------------------------------------------------------------------------- */
diff --git a/src/kspace.h b/src/kspace.h
index 1f0a0a3e82..e27cd7297a 100644
--- a/src/kspace.h
+++ b/src/kspace.h
@@ -62,6 +62,13 @@ class KSpace : protected Pointers {
   void *extract(const char *);
   void compute_dummy(int, int);
 
+  // triclinic
+
+  void x2lamdaT(double *, double *);
+  void lamda2xT(double *, double *);
+  void lamda2xvector(double *, double *);
+  void kspacebbox(double, double *);
+
   // general child-class methods
 
   virtual void init() = 0;