From 8745580d5de6992ef8a55fe2c14c9f50f4379e63 Mon Sep 17 00:00:00 2001
From: sjplimp <sjplimp@f3b2605a-c512-4ea7-a41b-209d697bcdaa>
Date: Mon, 8 Aug 2011 19:40:03 +0000
Subject: [PATCH] git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6622
 f3b2605a-c512-4ea7-a41b-209d697bcdaa

---
 src/KSPACE/pppm.cpp       | 131 ++++---
 src/KSPACE/pppm.h         |  48 ++-
 src/KSPACE/pppm_tip4p.cpp |  75 ++--
 src/KSPACE/pppm_tip4p.h   |  10 +-
 src/KSPACE/remap.cpp      |  90 ++---
 src/KSPACE/remap.h        |  22 +-
 src/KSPACE/remap_wrap.cpp |   2 +-
 src/KSPACE/remap_wrap.h   |   4 +-
 src/pack.cpp              | 757 --------------------------------------
 src/pack.h                | 753 ++++++++++++++++++++++++++++++++++++-
 10 files changed, 944 insertions(+), 948 deletions(-)
 delete mode 100644 src/pack.cpp
diff --git a/src/KSPACE/pppm.cpp b/src/KSPACE/pppm.cpp
index e6025d8035..7c671d0a3c 100644
--- a/src/KSPACE/pppm.cpp
+++ b/src/KSPACE/pppm.cpp
@@ -43,6 +43,14 @@ using namespace LAMMPS_NS;
 #define LARGE 10000.0
 #define EPS_HOC 1.0e-7
 
+#ifdef FFT_SINGLE
+#define ZEROF 0.0f
+#define ONEF  1.0f
+#else
+#define ZEROF 0.0
+#define ONEF  1.0
+#endif
+
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
 #define MAX(a,b) ((a) > (b) ? (a) : (b))
 
@@ -50,7 +58,7 @@ using namespace LAMMPS_NS;
 
 PPPM::PPPM(LAMMPS *lmp, int narg, char **arg) : KSpace(lmp, narg, arg)
 {
-  if (narg != 1) error->all("Illegal kspace_style pppm command");
+  if (narg < 1) error->all("Illegal kspace_style pppm command");
 
   precision = atof(arg[0]);
   PI = 4.0*atan(1.0);
@@ -754,7 +762,7 @@ void PPPM::allocate()
 
   // summation coeffs
 
-  gf_b = new double[order];
+  memory->create(gf_b,order,"pppm:gf_b");
   memory->create2d_offset(rho1d,3,-order/2,order/2,"pppm:rho1d");
   memory->create2d_offset(rho_coeff,order,(1-order)/2,order/2,"pppm:rho_coeff");
 
@@ -778,7 +786,7 @@ void PPPM::allocate()
   remap = new Remap(lmp,world,
 		    nxlo_in,nxhi_in,nylo_in,nyhi_in,nzlo_in,nzhi_in,
 		    nxlo_fft,nxhi_fft,nylo_fft,nyhi_fft,nzlo_fft,nzhi_fft,
-		    1,0,0,2);
+		    1,0,0,FFT_PRECISION);
 }
 
 /* ----------------------------------------------------------------------
@@ -805,7 +813,7 @@ void PPPM::deallocate()
   memory->destroy(buf1);
   memory->destroy(buf2);
 
-  delete [] gf_b;
+  memory->destroy(gf_b);
   memory->destroy2d_offset(rho1d,-order/2);
   memory->destroy2d_offset(rho_coeff,(1-order)/2);
 
@@ -967,17 +975,24 @@ void PPPM::set_grid()
   // print info
 
   if (me == 0) {
+#ifdef FFT_SINGLE
+    const char fft_prec[] = "single";
+#else
+    const char fft_prec[] = "double";
+#endif
     if (screen) {
       fprintf(screen,"  G vector = %g\n",g_ewald);
       fprintf(screen,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
       fprintf(screen,"  stencil order = %d\n",order);
       fprintf(screen,"  RMS precision = %g\n",MAX(lpr,spr));
+      fprintf(screen,"  using %s precision FFTs\n",fft_prec);
     }
     if (logfile) {
       fprintf(logfile,"  G vector = %g\n",g_ewald);
       fprintf(logfile,"  grid = %d %d %d\n",nx_pppm,ny_pppm,nz_pppm);
       fprintf(logfile,"  stencil order = %d\n",order);
       fprintf(logfile,"  RMS precision = %g\n",MAX(lpr,spr));
+      fprintf(logfile,"  using %s precision FFTs\n",fft_prec);
     }
   }
 }
@@ -1036,7 +1051,7 @@ double PPPM::diffpr(double hx, double hy, double hz, double q2, double **acons)
   lprz = rms(hz,zprd*slab_volfactor,natoms,q2,acons);
   kspace_prec = sqrt(lprx*lprx + lpry*lpry + lprz*lprz) / sqrt(3.0);
   real_prec = 2.0*q2 * exp(-g_ewald*g_ewald*cutoff*cutoff) / 
-   sqrt(natoms*cutoff*xprd*yprd*zprd);
+   sqrt(static_cast<double>(natoms)*cutoff*xprd*yprd*zprd);
   double value = kspace_prec - real_prec;
   return value;
 }
@@ -1113,8 +1128,8 @@ void PPPM::brick2fft()
   if (comm->procneigh[0][1] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][1],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1137,8 +1152,8 @@ void PPPM::brick2fft()
   if (comm->procneigh[0][0] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][0],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1161,8 +1176,8 @@ void PPPM::brick2fft()
   if (comm->procneigh[1][1] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][1],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1185,8 +1200,8 @@ void PPPM::brick2fft()
   if (comm->procneigh[1][0] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][0],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1209,8 +1224,8 @@ void PPPM::brick2fft()
   if (comm->procneigh[2][1] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][1],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1233,8 +1248,8 @@ void PPPM::brick2fft()
   if (comm->procneigh[2][0] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][0],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1284,8 +1299,8 @@ void PPPM::fillbrick()
   if (comm->procneigh[2][1] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][1],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1314,8 +1329,8 @@ void PPPM::fillbrick()
   if (comm->procneigh[2][0] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[2][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[2][0],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[2][1],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[2][0],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1344,8 +1359,8 @@ void PPPM::fillbrick()
   if (comm->procneigh[1][1] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][1],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1374,8 +1389,8 @@ void PPPM::fillbrick()
   if (comm->procneigh[1][0] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[1][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[1][0],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[1][1],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[1][0],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1404,8 +1419,8 @@ void PPPM::fillbrick()
   if (comm->procneigh[0][1] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][0],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][1],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1434,8 +1449,8 @@ void PPPM::fillbrick()
   if (comm->procneigh[0][0] == me)
     for (i = 0; i < n; i++) buf2[i] = buf1[i];
   else {
-    MPI_Irecv(buf2,nbuf,MPI_DOUBLE,comm->procneigh[0][1],0,world,&request);
-    MPI_Send(buf1,n,MPI_DOUBLE,comm->procneigh[0][0],0,world);
+    MPI_Irecv(buf2,nbuf,MPI_FFT_SCALAR,comm->procneigh[0][1],0,world,&request);
+    MPI_Send(buf1,n,MPI_FFT_SCALAR,comm->procneigh[0][0],0,world);
     MPI_Wait(&request,&status);
   }
 
@@ -1497,12 +1512,12 @@ void PPPM::particle_map()
 void PPPM::make_rho()
 {
   int i,l,m,n,nx,ny,nz,mx,my,mz;
-  double dx,dy,dz,x0,y0,z0;
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
 
   // clear 3d density array
 
-  double *vec = &density_brick[nzlo_out][nylo_out][nxlo_out];
-  for (i = 0; i < ngrid; i++) vec[i] = 0.0;
+  FFT_SCALAR *vec = &density_brick[nzlo_out][nylo_out][nxlo_out];
+  for (i = 0; i < ngrid; i++) vec[i] = ZEROF;
 
   // loop over my charges, add their contribution to nearby grid points
   // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
@@ -1554,7 +1569,7 @@ void PPPM::poisson(int eflag, int vflag)
   n = 0;
   for (i = 0; i < nfft; i++) {
     work1[n++] = density_fft[i];
-    work1[n++] = 0.0;
+    work1[n++] = ZEROF;
   }
  
   fft1->compute(work1,work1,1);
@@ -1667,8 +1682,8 @@ void PPPM::poisson(int eflag, int vflag)
 void PPPM::fieldforce()
 {
   int i,l,m,n,nx,ny,nz,mx,my,mz;
-  double dx,dy,dz,x0,y0,z0;
-  double ek[3];
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
+  FFT_SCALAR ekx,eky,ekz;
 
   // loop over my charges, interpolate electric field from nearby grid points
   // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
@@ -1679,6 +1694,7 @@ void PPPM::fieldforce()
   double *q = atom->q;
   double **x = atom->x;
   double **f = atom->f;
+
   int nlocal = atom->nlocal;
 
   for (i = 0; i < nlocal; i++) {
@@ -1691,7 +1707,7 @@ void PPPM::fieldforce()
 
     compute_rho1d(dx,dy,dz);
 
-    ek[0] = ek[1] = ek[2] = 0.0;
+    ekx = eky = ekz = ZEROF;
     for (n = nlower; n <= nupper; n++) {
       mz = n+nz;
       z0 = rho1d[2][n];
@@ -1701,18 +1717,18 @@ void PPPM::fieldforce()
 	for (l = nlower; l <= nupper; l++) {
 	  mx = l+nx;
 	  x0 = y0*rho1d[0][l];
-	  ek[0] -= x0*vdx_brick[mz][my][mx];;
-	  ek[1] -= x0*vdy_brick[mz][my][mx];;
-	  ek[2] -= x0*vdz_brick[mz][my][mx];;
+	  ekx -= x0*vdx_brick[mz][my][mx];
+	  eky -= x0*vdy_brick[mz][my][mx];
+	  ekz -= x0*vdz_brick[mz][my][mx];
 	}
       }
     }
 
     // convert E-field to force
-
-    f[i][0] += qqrd2e*scale * q[i]*ek[0];
-    f[i][1] += qqrd2e*scale * q[i]*ek[1];
-    f[i][2] += qqrd2e*scale * q[i]*ek[2];
+    const double qfactor = qqrd2e*scale*q[i];
+    f[i][0] += qfactor*ekx;
+    f[i][1] += qfactor*eky;
+    f[i][2] += qfactor*ekz;
   }
 }
 
@@ -1758,15 +1774,16 @@ void PPPM::procs2grid2d(int nprocs, int nx, int ny, int *px, int *py)
    charge assignment into rho1d
    dx,dy,dz = distance of particle from "lower left" grid point 
 ------------------------------------------------------------------------- */
-
-void PPPM::compute_rho1d(double dx, double dy, double dz)
+void PPPM::compute_rho1d(const FFT_SCALAR &dx, const FFT_SCALAR &dy,
+			 const FFT_SCALAR &dz)
 {
   int k,l;
 
   for (k = (1-order)/2; k <= order/2; k++) {
-    rho1d[0][k] = 0.0;
-    rho1d[1][k] = 0.0;
-    rho1d[2][k] = 0.0;
+    rho1d[0][k] = ZEROF;
+    rho1d[1][k] = ZEROF;
+    rho1d[2][k] = ZEROF;
+
     for (l = order-1; l >= 0; l--) {
       rho1d[0][k] = rho_coeff[l][k] + rho1d[0][k]*dx;
       rho1d[1][k] = rho_coeff[l][k] + rho1d[1][k]*dy;
@@ -1797,9 +1814,9 @@ void PPPM::compute_rho1d(double dx, double dy, double dz)
 void PPPM::compute_rho_coeff()
 {
   int j,k,l,m;
-  double s;
+  FFT_SCALAR s;
 
-  double **a;
+  FFT_SCALAR **a;
   memory->create2d_offset(a,order,-order,order,"pppm:a");
 
   for (k = -order; k <= order; k++) 
@@ -1812,8 +1829,13 @@ void PPPM::compute_rho_coeff()
       s = 0.0;
       for (l = 0; l < j; l++) {
 	a[l+1][k] = (a[l][k+1]-a[l][k-1]) / (l+1);
+#ifdef FFT_SINGLE
+	s += powf(0.5,(float) l+1) *
+	  (a[l][k-1] + powf(-1.0,(float) l) * a[l][k+1]) / (l+1);
+#else
 	s += pow(0.5,(double) l+1) * 
 	  (a[l][k-1] + pow(-1.0,(double) l) * a[l][k+1]) / (l+1);
+#endif
       }
       a[0][k] = s;
     }
@@ -1874,7 +1896,7 @@ void PPPM::timing(int n, double &time3d, double &time1d)
 {
   double time1,time2;
 
-  for (int i = 0; i < 2*nfft_both; i++) work1[i] = 0.0;
+  for (int i = 0; i < 2*nfft_both; i++) work1[i] = ZEROF;
 
   MPI_Barrier(world);
   time1 = MPI_Wtime();
@@ -1914,9 +1936,10 @@ double PPPM::memory_usage()
   double bytes = nmax*3 * sizeof(double);
   int nbrick = (nxhi_out-nxlo_out+1) * (nyhi_out-nylo_out+1) * 
     (nzhi_out-nzlo_out+1);
-  bytes += 4 * nbrick * sizeof(double);
+  bytes += 4 * nbrick * sizeof(FFT_SCALAR);
   bytes += 6 * nfft_both * sizeof(double);
-  bytes += nfft_both*6 * sizeof(double);
-  bytes += 2 * nbuf * sizeof(double);
+  bytes += nfft_both * sizeof(double);
+  bytes += nfft_both*5 * sizeof(FFT_SCALAR);
+  bytes += 2 * nbuf * sizeof(FFT_SCALAR);
   return bytes;
 }
diff --git a/src/KSPACE/pppm.h b/src/KSPACE/pppm.h
index f0ee75d12a..982b8f7c9e 100644
--- a/src/KSPACE/pppm.h
+++ b/src/KSPACE/pppm.h
@@ -1,4 +1,4 @@
-/* ----------------------------------------------------------------------
+/* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
@@ -20,6 +20,17 @@ KSpaceStyle(pppm,PPPM)
 #ifndef LMP_PPPM_H
 #define LMP_PPPM_H
 
+#include "lmptype.h"
+#include "mpi.h"
+
+#ifdef FFT_SINGLE
+typedef float FFT_SCALAR;
+#define MPI_FFT_SCALAR MPI_FLOAT
+#else
+typedef double FFT_SCALAR;
+#define MPI_FFT_SCALAR MPI_DOUBLE
+#endif
+
 #include "kspace.h"
 
 namespace LAMMPS_NS {
@@ -28,11 +39,11 @@ class PPPM : public KSpace {
  public:
   PPPM(class LAMMPS *, int, char **);
   virtual ~PPPM();
-  void init();
-  void setup();
-  void compute(int, int);
-  void timing(int, double &, double &);
-  double memory_usage();
+  virtual void init();
+  virtual void setup();
+  virtual void compute(int, int);
+  virtual void timing(int, double &, double &);
+  virtual double memory_usage();
 
  protected:
   int me,nprocs;
@@ -54,17 +65,17 @@ class PPPM : public KSpace {
   int nlower,nupper;
   int ngrid,nfft,nbuf,nfft_both;
 
-  double ***density_brick;
-  double ***vdx_brick,***vdy_brick,***vdz_brick;
+  FFT_SCALAR ***density_brick;
+  FFT_SCALAR ***vdx_brick,***vdy_brick,***vdz_brick;
   double *greensfn;
   double **vg;
   double *fkx,*fky,*fkz;
-  double *density_fft;
-  double *work1,*work2;
-  double *buf1,*buf2;
+  FFT_SCALAR *density_fft;
+  FFT_SCALAR *work1,*work2;
+  FFT_SCALAR *buf1,*buf2;
 
   double *gf_b;
-  double **rho1d,**rho_coeff;
+  FFT_SCALAR **rho1d,**rho_coeff;
 
   class FFT3d *fft1,*fft2;
   class Remap *remap;
@@ -80,8 +91,8 @@ class PPPM : public KSpace {
   double alpha;                // geometric factor
 
   void set_grid();
-  void allocate();
-  void deallocate();
+  virtual void allocate();
+  virtual void deallocate();
   int factorable(int);
   double rms(double, double, bigint, double, double **);
   double diffpr(double, double, double, double, double **);
@@ -89,12 +100,13 @@ class PPPM : public KSpace {
   double gf_denom(double, double, double);
   virtual void particle_map();
   virtual void make_rho();
-  void brick2fft();
-  void fillbrick();
-  void poisson(int, int);
+  virtual void brick2fft();
+  virtual void fillbrick();
+  virtual void poisson(int, int);
   virtual void fieldforce();
   void procs2grid2d(int,int,int,int *, int*);
-  void compute_rho1d(double, double, double);
+  void compute_rho1d(const FFT_SCALAR &, const FFT_SCALAR &, 
+		     const FFT_SCALAR &);
   void compute_rho_coeff();
   void slabcorr(int);
 };
diff --git a/src/KSPACE/pppm_tip4p.cpp b/src/KSPACE/pppm_tip4p.cpp
index 08ade46a07..af0af92fad 100644
--- a/src/KSPACE/pppm_tip4p.cpp
+++ b/src/KSPACE/pppm_tip4p.cpp
@@ -26,6 +26,14 @@ using namespace LAMMPS_NS;
 
 #define OFFSET 16384
 
+#ifdef FFT_SINGLE
+#define ZEROF 0.0f
+#define ONEF  1.0f
+#else
+#define ZEROF 0.0
+#define ONEF  1.0
+#endif
+
 /* ---------------------------------------------------------------------- */
 
 PPPMTIP4P::PPPMTIP4P(LAMMPS *lmp, int narg, char **arg) :
@@ -87,13 +95,13 @@ void PPPMTIP4P::particle_map()
 void PPPMTIP4P::make_rho()
 {
   int i,l,m,n,nx,ny,nz,mx,my,mz,iH1,iH2;
-  double dx,dy,dz,x0,y0,z0;
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
   double *xi,xM[3];
 
   // clear 3d density array
 
-  double *vec = &density_brick[nzlo_out][nylo_out][nxlo_out];
-  for (i = 0; i < ngrid; i++) vec[i] = 0.0;
+  FFT_SCALAR *vec = &density_brick[nzlo_out][nylo_out][nxlo_out];
+  for (i = 0; i < ngrid; i++) vec[i] = ZEROF;
 
   // loop over my charges, add their contribution to nearby grid points
   // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
@@ -143,13 +151,13 @@ void PPPMTIP4P::make_rho()
 void PPPMTIP4P::fieldforce()
 {
   int i,l,m,n,nx,ny,nz,mx,my,mz;
-  double dx,dy,dz,x0,y0,z0;
-  double ek[3];
+  FFT_SCALAR dx,dy,dz,x0,y0,z0;
+  FFT_SCALAR ekx,eky,ekz;
   double *xi;
   int iH1,iH2;
   double xM[3];
   double fx,fy,fz;
-  double ddotf, rOM[3], f1[3];
+  double ddotf, rOMx, rOMy, rOMz, f1x, f1y, f1z;
 
   // loop over my charges, interpolate electric field from nearby grid points
   // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
@@ -160,6 +168,7 @@ void PPPMTIP4P::fieldforce()
   double *q = atom->q;
   double **x = atom->x;
   double **f = atom->f;
+
   int *type = atom->type;
   int nlocal = atom->nlocal;
 
@@ -178,7 +187,7 @@ void PPPMTIP4P::fieldforce()
 
     compute_rho1d(dx,dy,dz);
 
-    ek[0] = ek[1] = ek[2] = 0.0;
+    ekx = eky = ekz = ZEROF;
     for (n = nlower; n <= nupper; n++) {
       mz = n+nz;
       z0 = rho1d[2][n];
@@ -188,47 +197,47 @@ void PPPMTIP4P::fieldforce()
 	for (l = nlower; l <= nupper; l++) {
 	  mx = l+nx;
 	  x0 = y0*rho1d[0][l];
-	  ek[0] -= x0*vdx_brick[mz][my][mx];
-	  ek[1] -= x0*vdy_brick[mz][my][mx];
-	  ek[2] -= x0*vdz_brick[mz][my][mx];
+	  ekx -= x0*vdx_brick[mz][my][mx];
+	  eky -= x0*vdy_brick[mz][my][mx];
+	  ekz -= x0*vdz_brick[mz][my][mx];
 	}
       }
     }
 
     // convert E-field to force
-
+    const double qfactor = qqrd2e*scale*q[i];
     if (type[i] != typeO) {
-      f[i][0] += qqrd2e*q[i]*ek[0];
-      f[i][1] += qqrd2e*q[i]*ek[1];
-      f[i][2] += qqrd2e*q[i]*ek[2];
+      f[i][0] += qfactor*ekx;
+      f[i][1] += qfactor*eky;
+      f[i][2] += qfactor*ekz;
     } else {
 
-      fx = qqrd2e * q[i] * ek[0];
-      fy = qqrd2e * q[i] * ek[1];
-      fz = qqrd2e * q[i] * ek[2];
+      fx = qfactor * ekx;
+      fy = qfactor * eky;
+      fz = qfactor * ekz;
       find_M(i,iH1,iH2,xM);
 
-      rOM[0] = xM[0] - x[i][0];
-      rOM[1] = xM[1] - x[i][1];
-      rOM[2] = xM[2] - x[i][2];
+      rOMx = xM[0] - x[i][0];
+      rOMy = xM[1] - x[i][1];
+      rOMz = xM[2] - x[i][2];
   
-      ddotf = (rOM[0] * fx + rOM[1] * fy + rOM[2] * fz) / (qdist * qdist);
+      ddotf = (rOMx * fx + rOMy * fy + rOMz * fz) / (qdist * qdist);
 
-      f1[0] = ddotf * rOM[0];
-      f1[1] = ddotf * rOM[1];
-      f1[2] = ddotf * rOM[2];
+      f1x = ddotf * rOMx;
+      f1y = ddotf * rOMy;
+      f1z = ddotf * rOMz;
 
-      f[i][0] += fx - alpha * (fx - f1[0]);
-      f[i][1] += fy - alpha * (fy - f1[1]);
-      f[i][2] += fz - alpha * (fz - f1[2]);
+      f[i][0] += fx - alpha * (fx - f1x);
+      f[i][1] += fy - alpha * (fy - f1y);
+      f[i][2] += fz - alpha * (fz - f1z);
 
-      f[iH1][0] += 0.5*alpha*(fx - f1[0]); 
-      f[iH1][1] += 0.5*alpha*(fy - f1[1]); 
-      f[iH1][2] += 0.5*alpha*(fz - f1[2]); 
+      f[iH1][0] += 0.5*alpha*(fx - f1x);
+      f[iH1][1] += 0.5*alpha*(fy - f1y);
+      f[iH1][2] += 0.5*alpha*(fz - f1z);
 
-      f[iH2][0] += 0.5*alpha*(fx - f1[0]); 
-      f[iH2][1] += 0.5*alpha*(fy - f1[1]); 
-      f[iH2][2] += 0.5*alpha*(fz - f1[2]); 
+      f[iH2][0] += 0.5*alpha*(fx - f1x);
+      f[iH2][1] += 0.5*alpha*(fy - f1y);
+      f[iH2][2] += 0.5*alpha*(fz - f1z);
     }
   }
 }
diff --git a/src/KSPACE/pppm_tip4p.h b/src/KSPACE/pppm_tip4p.h
index 4a56a0270b..764db68edd 100644
--- a/src/KSPACE/pppm_tip4p.h
+++ b/src/KSPACE/pppm_tip4p.h
@@ -27,12 +27,14 @@ namespace LAMMPS_NS {
 class PPPMTIP4P : public PPPM {
  public:
   PPPMTIP4P(class LAMMPS *, int, char **);
+  virtual ~PPPMTIP4P () {};
+
+ protected:
+  virtual void particle_map();
+  virtual void make_rho();
+  virtual void fieldforce();
 
  private:
-  void particle_map();
-  void make_rho();
-  void fieldforce();
-
   void find_M(int, int &, int &, double *); 
 };
 
diff --git a/src/KSPACE/remap.cpp b/src/KSPACE/remap.cpp
index f30a2d97f2..dbf9476a80 100644
--- a/src/KSPACE/remap.cpp
+++ b/src/KSPACE/remap.cpp
@@ -11,10 +11,12 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
-#include "mpi.h"
 #include "stdio.h"
 #include "stdlib.h"
 #include "remap.h"
+
+#define PACK_DATA FFT_SCALAR
+
 #include "pack.h"
 
 #define MIN(A,B) ((A) < (B)) ? (A) : (B)
@@ -57,13 +59,13 @@
    plan         plan returned by previous call to remap_3d_create_plan
 ------------------------------------------------------------------------- */
 
-void remap_3d(double *in, double *out, double *buf,
+void remap_3d(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf,
 	      struct remap_plan_3d *plan)
 
 {
   MPI_Status status;
   int i,isend,irecv;
-  double *scratch;
+  FFT_SCALAR *scratch;
 
   if (plan->memory == 0)
     scratch = buf;
@@ -74,7 +76,7 @@ void remap_3d(double *in, double *out, double *buf,
 
   for (irecv = 0; irecv < plan->nrecv; irecv++)
     MPI_Irecv(&scratch[plan->recv_bufloc[irecv]],plan->recv_size[irecv],
-	      MPI_DOUBLE,plan->recv_proc[irecv],0,
+	      MPI_FFT_SCALAR,plan->recv_proc[irecv],0,
 	      plan->comm,&plan->request[irecv]);
 
   // send all messages to other procs 
@@ -82,7 +84,7 @@ void remap_3d(double *in, double *out, double *buf,
   for (isend = 0; isend < plan->nsend; isend++) {
     plan->pack(&in[plan->send_offset[isend]],
 	       plan->sendbuf,&plan->packplan[isend]);
-    MPI_Send(plan->sendbuf,plan->send_size[isend],MPI_DOUBLE,
+    MPI_Send(plan->sendbuf,plan->send_size[isend],MPI_FFT_SCALAR,
 	     plan->send_proc[isend],0,plan->comm);
   }       
 
@@ -150,13 +152,6 @@ struct remap_plan_3d *remap_3d_create_plan(
   MPI_Comm_rank(comm,&me);
   MPI_Comm_size(comm,&nprocs);
 
-  // single precision not yet supported 
-
-  if (precision == 1) {
-    if (me == 0) printf("Single precision not supported\n");
-    return NULL;
-  }
-
   // allocate memory for plan data struct 
 
   plan = (struct remap_plan_3d *) malloc(sizeof(struct remap_plan_3d));
@@ -209,10 +204,7 @@ struct remap_plan_3d *remap_3d_create_plan(
   // malloc space for send info 
 
   if (nsend) {
-    if (precision == 1)
-      plan->pack = NULL;
-    else
-      plan->pack = pack_3d;
+    plan->pack = pack_3d;
 
     plan->send_offset = (int *) malloc(nsend*sizeof(int));
     plan->send_size = (int *) malloc(nsend*sizeof(int));
@@ -272,45 +264,23 @@ struct remap_plan_3d *remap_3d_create_plan(
   // malloc space for recv info 
 
   if (nrecv) {
-    if (precision == 1) {
-      if (permute == 0)
-	plan->unpack = NULL;
-      else if (permute == 1) {
-	if (nqty == 1)
-	  plan->unpack = NULL;
-	else if (nqty == 2)
-	  plan->unpack = NULL;
-	else
-	  plan->unpack = NULL;
-      }
-      else if (permute == 2) {
-	if (nqty == 1)
-	  plan->unpack = NULL;
-	else if (nqty == 2)
-	  plan->unpack = NULL;
-	else
-	  plan->unpack = NULL;
-      }
+    if (permute == 0)
+      plan->unpack = unpack_3d;
+    else if (permute == 1) {
+      if (nqty == 1)
+	plan->unpack = unpack_3d_permute1_1;
+      else if (nqty == 2)
+	plan->unpack = unpack_3d_permute1_2;
+      else
+	plan->unpack = unpack_3d_permute1_n;
     }
-    else if (precision == 2) {
-      if (permute == 0)
-	plan->unpack = unpack_3d;
-      else if (permute == 1) {
-	if (nqty == 1)
-	  plan->unpack = unpack_3d_permute1_1;
-	else if (nqty == 2)
-	  plan->unpack = unpack_3d_permute1_2;
-	else
-	  plan->unpack = unpack_3d_permute1_n;
-      }
-      else if (permute == 2) {
-	if (nqty == 1)
-	  plan->unpack = unpack_3d_permute2_1;
-	else if (nqty == 2)
-	  plan->unpack = unpack_3d_permute2_2;
-	else
-	  plan->unpack = unpack_3d_permute2_n;
-      }
+    else if (permute == 2) {
+      if (nqty == 1)
+	plan->unpack = unpack_3d_permute2_1;
+      else if (nqty == 2)
+	plan->unpack = unpack_3d_permute2_2;
+      else
+	plan->unpack = unpack_3d_permute2_n;
     }
 
     plan->recv_offset = (int *) malloc(nrecv*sizeof(int));
@@ -408,10 +378,7 @@ struct remap_plan_3d *remap_3d_create_plan(
     size = MAX(size,plan->send_size[nsend]);
 
   if (size) {
-    if (precision == 1)
-      plan->sendbuf = NULL;
-    else
-      plan->sendbuf = (double *) malloc(size*sizeof(double));
+    plan->sendbuf = (FFT_SCALAR *) malloc(size*sizeof(FFT_SCALAR));
     if (plan->sendbuf == NULL) return NULL;
   }
 
@@ -422,11 +389,8 @@ struct remap_plan_3d *remap_3d_create_plan(
 
   if (memory == 1) {
     if (nrecv > 0) {
-      if (precision == 1)
-	plan->scratch = NULL;
-      else
-	plan->scratch =
-	  (double *) malloc(nqty*out.isize*out.jsize*out.ksize*sizeof(double));
+      plan->scratch =
+	(FFT_SCALAR *) malloc(nqty*out.isize*out.jsize*out.ksize*sizeof(FFT_SCALAR));
       if (plan->scratch == NULL) return NULL;
     }
   }
diff --git a/src/KSPACE/remap.h b/src/KSPACE/remap.h
index 6788354c69..648b3a0d16 100644
--- a/src/KSPACE/remap.h
+++ b/src/KSPACE/remap.h
@@ -1,4 +1,4 @@
-/* ----------------------------------------------------------------------
+/* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
@@ -11,14 +11,24 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
+#include <mpi.h>
+
+#ifdef FFT_SINGLE
+typedef float FFT_SCALAR;
+#define MPI_FFT_SCALAR MPI_FLOAT
+#else
+typedef double FFT_SCALAR;
+#define MPI_FFT_SCALAR MPI_DOUBLE
+#endif
+
 // details of how to do a 3d remap 
 
 struct remap_plan_3d {
-  double *sendbuf;                  // buffer for MPI sends 
-  double *scratch;                  // scratch buffer for MPI recvs 
-  void (*pack)(double *, double *, struct pack_plan_3d *);
+  FFT_SCALAR *sendbuf;                  // buffer for MPI sends 
+  FFT_SCALAR *scratch;                  // scratch buffer for MPI recvs 
+  void (*pack)(FFT_SCALAR *, FFT_SCALAR *, struct pack_plan_3d *);
                                     // which pack function to use 
-  void (*unpack)(double *, double *, struct pack_plan_3d *);
+  void (*unpack)(FFT_SCALAR *, FFT_SCALAR *, struct pack_plan_3d *);
                                     // which unpack function to use 
   int *send_offset;                 // extraction loc for each send 
   int *send_size;                   // size of each send message 
@@ -47,7 +57,7 @@ struct extent_3d {
 
 // function prototypes 
 
-void remap_3d(double *, double *, double *, struct remap_plan_3d *);
+void remap_3d(FFT_SCALAR *, FFT_SCALAR *, FFT_SCALAR *, struct remap_plan_3d *);
 struct remap_plan_3d *remap_3d_create_plan(MPI_Comm, 
   int, int, int, int, int, int,	int, int, int, int, int, int,
   int, int, int, int);
diff --git a/src/KSPACE/remap_wrap.cpp b/src/KSPACE/remap_wrap.cpp
index d46df6be48..83631defe4 100644
--- a/src/KSPACE/remap_wrap.cpp
+++ b/src/KSPACE/remap_wrap.cpp
@@ -42,7 +42,7 @@ Remap::~Remap()
 
 /* ---------------------------------------------------------------------- */
 
-void Remap::perform(double *in, double *out, double *buf)
+void Remap::perform(FFT_SCALAR *in, FFT_SCALAR *out, FFT_SCALAR *buf)
 {
   remap_3d(in,out,buf,plan);
 }
diff --git a/src/KSPACE/remap_wrap.h b/src/KSPACE/remap_wrap.h
index 7145769d4c..dd57e18d17 100644
--- a/src/KSPACE/remap_wrap.h
+++ b/src/KSPACE/remap_wrap.h
@@ -1,4 +1,4 @@
-/* ----------------------------------------------------------------------
+/* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
@@ -24,7 +24,7 @@ class Remap : protected Pointers {
   Remap(class LAMMPS *, MPI_Comm,int,int,int,int,int,int,
 	int,int,int,int,int,int,int,int,int,int);
   ~Remap();
-  void perform(double *, double *, double *);
+  void perform(FFT_SCALAR *, FFT_SCALAR *, FFT_SCALAR *);
 
  private:
   struct remap_plan_3d *plan;
diff --git a/src/pack.cpp b/src/pack.cpp
deleted file mode 100644
index 0b6ba908a9..0000000000
--- a/src/pack.cpp
+++ /dev/null
@@ -1,757 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under 
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#include "pack.h"
-
-#if !defined(PACK_POINTER) && !defined(PACK_MEMCPY)
-#define PACK_ARRAY
-#endif
-
-/* ----------------------------------------------------------------------
-   Pack and unpack functions:
-
-   pack routines copy strided values from data into contiguous locs in buf
-   unpack routines copy contiguous values from buf into strided locs in data
-   different versions of unpack depending on permutation
-     and # of values/element
-   PACK_ARRAY routines work via array indices (default)
-   PACK_POINTER routines work via pointers
-   PACK_MEMCPY routines work via pointers and memcpy function
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   pack/unpack with array indices 
-------------------------------------------------------------------------- */
-
-#ifdef PACK_ARRAY
-
-/* ----------------------------------------------------------------------
-   pack from data -> buf 
-------------------------------------------------------------------------- */
-
-void pack_3d(double *data, double *buf, struct pack_plan_3d *plan)
-
-{
-  register int in,out,fast,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  in = 0;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_plane;
-    for (mid = 0; mid < nmid; mid++) {
-      out = plane + mid*nstride_line;
-      for (fast = 0; fast < nfast; fast++)
-	buf[in++] = data[out++];
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data 
-------------------------------------------------------------------------- */
-
-void unpack_3d(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register int in,out,fast,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = 0;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_plane;
-    for (mid = 0; mid < nmid; mid++) {
-      in = plane + mid*nstride_line;
-      for (fast = 0; fast < nfast; fast++)
-	data[in++] = buf[out++];
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, one axis permutation, 1 value/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute1_1(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register int in,out,fast,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = 0;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_line;
-    for (mid = 0; mid < nmid; mid++) {
-      in = plane + mid;
-      for (fast = 0; fast < nfast; fast++, in += nstride_plane)
-	data[in] = buf[out++];
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, one axis permutation, 2 values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute1_2(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register int in,out,fast,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = 0;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_line;
-    for (mid = 0; mid < nmid; mid++) {
-      in = plane + 2*mid;
-      for (fast = 0; fast < nfast; fast++, in += nstride_plane) {
-	data[in] = buf[out++];
-	data[in+1] = buf[out++];
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, one axis permutation, nqty values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute1_n(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register int in,out,iqty,instart,fast,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane,nqty;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-  nqty = plan->nqty;
-
-  out = 0;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_line;
-    for (mid = 0; mid < nmid; mid++) {
-      instart = plane + nqty*mid;
-      for (fast = 0; fast < nfast; fast++, instart += nstride_plane) {
-	in = instart;
-	for (iqty = 0; iqty < nqty; iqty++) data[in++] = buf[out++];
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, two axis permutation, 1 value/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute2_1(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register int in,out,fast,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = 0;
-  for (slow = 0; slow < nslow; slow++) {
-    for (mid = 0; mid < nmid; mid++) {
-      in = slow + mid*nstride_plane;
-      for (fast = 0; fast < nfast; fast++, in += nstride_line)
-	data[in] = buf[out++];
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, two axis permutation, 2 values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute2_2(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register int in,out,fast,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = 0;
-  for (slow = 0; slow < nslow; slow++) {
-    for (mid = 0; mid < nmid; mid++) {
-      in = 2*slow + mid*nstride_plane;
-      for (fast = 0; fast < nfast; fast++, in += nstride_line) {
-	data[in] = buf[out++];
-	data[in+1] = buf[out++];
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, two axis permutation, nqty values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute2_n(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register int in,out,iqty,instart,fast,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,nqty;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-  nqty = plan->nqty;
-
-  out = 0;
-  for (slow = 0; slow < nslow; slow++) {
-    for (mid = 0; mid < nmid; mid++) {
-      instart = nqty*slow + mid*nstride_plane;
-      for (fast = 0; fast < nfast; fast++, instart += nstride_line) {
-	in = instart;
-	for (iqty = 0; iqty < nqty; iqty++) data[in++] = buf[out++];
-      }
-    }
-  }
-}
-
-#endif
-
-/* ----------------------------------------------------------------------
-   pack/unpack with pointers 
-------------------------------------------------------------------------- */
-
-#ifdef PACK_POINTER
-
-/* ----------------------------------------------------------------------
-   pack from data -> buf 
-------------------------------------------------------------------------- */
-
-void pack_3d(double *data, double *buf, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*begin,*end;
-  register int mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  in = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_plane;
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[plane+mid*nstride_line]);
-      end = begin + nfast;
-      for (out = begin; out < end; out++)
-	*(in++) = *out;
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data 
-------------------------------------------------------------------------- */
-
-void unpack_3d(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*begin,*end;
-  register int mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_plane;
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[plane+mid*nstride_line]);
-      end = begin + nfast;
-      for (in = begin; in < end; in++)
-	*in = *(out++);
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, one axis permutation, 1 value/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute1_1(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*begin,*end;
-  register int mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_line;
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[plane+mid]);
-      end = begin + nfast*nstride_plane;
-      for (in = begin; in < end; in += nstride_plane)
-	*in = *(out++);
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, one axis permutation, 2 values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute1_2(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*begin,*end;
-  register int mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_line;
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[plane+2*mid]);
-      end = begin + nfast*nstride_plane;
-      for (in = begin; in < end; in += nstride_plane) {
-	*in = *(out++);
-	*(in+1) = *(out++);
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, one axis permutation, nqty values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute1_n(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*instart,*begin,*end;
-  register int iqty,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane,nqty;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-  nqty = plan->nqty;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_line;
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[plane+nqty*mid]);
-      end = begin + nfast*nstride_plane;
-      for (instart = begin; instart < end; instart += nstride_plane) {
-	in = instart;
-	for (iqty = 0; iqty < nqty; iqty++) *(in++) = *(out++);
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, two axis permutation, 1 value/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute2_1(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*begin,*end;
-  register int mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[slow+mid*nstride_plane]);
-      end = begin + nfast*nstride_line;
-      for (in = begin; in < end; in += nstride_line)
-	*in = *(out++);
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, two axis permutation, 2 values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute2_2(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*begin,*end;
-  register int mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[2*slow+mid*nstride_plane]);
-      end = begin + nfast*nstride_line;
-      for (in = begin; in < end; in += nstride_line) {
-	*in = *(out++);
-	*(in+1) = *(out++);
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, two axis permutation, nqty values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute2_n(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*instart,*begin,*end;
-  register int iqty,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,nqty;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-  nqty = plan->nqty;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[nqty*slow+mid*nstride_plane]);
-      end = begin + nfast*nstride_line;
-      for (instart = begin; instart < end; instart += nstride_line) {
-	in = instart;
-	for (iqty = 0; iqty < nqty; iqty++) *(in++) = *(out++);
-      }
-    }
-  }
-}
-
-#endif
-
-/* ----------------------------------------------------------------------
-   pack/unpack with pointers and memcpy function 
-   no memcpy version of unpack_permute routines,
-     just use PACK_POINTER versions 
-------------------------------------------------------------------------- */
-
-#ifdef PACK_MEMCPY
-
-/* ----------------------------------------------------------------------
-   pack from data -> buf 
-------------------------------------------------------------------------- */
-
-void pack_3d(double *data, double *buf, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out;
-  register int mid,slow,size;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane,upto;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  size = nfast*sizeof(double);
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_plane;
-    upto = slow*nmid*nfast;
-    for (mid = 0; mid < nmid; mid++) {
-      in = &(buf[upto+mid*nfast]);
-      out = &(data[plane+mid*nstride_line]);
-      memcpy(in,out,size);
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data 
-------------------------------------------------------------------------- */
-
-void unpack_3d(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out;
-  register int mid,slow,size;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane,upto;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  size = nfast*sizeof(double);
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_plane;
-    upto = slow*nmid*nfast;
-    for (mid = 0; mid < nmid; mid++) {
-      in = &(data[plane+mid*nstride_line]);
-      out = &(buf[upto+mid*nfast]);
-      memcpy(in,out,size);
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, one axis permutation, 1 value/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute1_1(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*begin,*end;
-  register int mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_line;
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[plane+mid]);
-      end = begin + nfast*nstride_plane;
-      for (in = begin; in < end; in += nstride_plane)
-	*in = *(out++);
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, one axis permutation, 2 values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute1_2(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*begin,*end;
-  register int mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_line;
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[plane+2*mid]);
-      end = begin + nfast*nstride_plane;
-      for (in = begin; in < end; in += nstride_plane) {
-	*in = *(out++);
-	*(in+1) = *(out++);
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, one axis permutation, nqty values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute1_n(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*instart,*begin,*end;
-  register int iqty,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane,nqty;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-  nqty = plan->nqty;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    plane = slow*nstride_line;
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[plane+nqty*mid]);
-      end = begin + nfast*nstride_plane;
-      for (instart = begin; instart < end; instart += nstride_plane) {
-	in = instart;
-	for (iqty = 0; iqty < nqty; iqty++) *(in++) = *(out++);
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, two axis permutation, 1 value/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute2_1(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*begin,*end;
-  register int mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[slow+mid*nstride_plane]);
-      end = begin + nfast*nstride_line;
-      for (in = begin; in < end; in += nstride_line)
-	*in = *(out++);
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, two axis permutation, 2 values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute2_2(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*begin,*end;
-  register int mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[2*slow+mid*nstride_plane]);
-      end = begin + nfast*nstride_line;
-      for (in = begin; in < end; in += nstride_line) {
-	*in = *(out++);
-	*(in+1) = *(out++);
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   unpack from buf -> data, two axis permutation, nqty values/element 
-------------------------------------------------------------------------- */
-
-void unpack_3d_permute2_n(double *buf, double *data, struct pack_plan_3d *plan)
-
-{
-  register double *in,*out,*instart,*begin,*end;
-  register int iqty,mid,slow;
-  register int nfast,nmid,nslow,nstride_line,nstride_plane,nqty;
-
-  nfast = plan->nfast;
-  nmid = plan->nmid;
-  nslow = plan->nslow;
-  nstride_line = plan->nstride_line;
-  nstride_plane = plan->nstride_plane;
-  nqty = plan->nqty;
-
-  out = buf;
-  for (slow = 0; slow < nslow; slow++) {
-    for (mid = 0; mid < nmid; mid++) {
-      begin = &(data[nqty*slow+mid*nstride_plane]);
-      end = begin + nfast*nstride_line;
-      for (instart = begin; instart < end; instart += nstride_line) {
-	in = instart;
-	for (iqty = 0; iqty < nqty; iqty++) *(in++) = *(out++);
-      }
-    }
-  }
-}
-
-#endif
diff --git a/src/pack.h b/src/pack.h
index bcb6208b2e..5f032e68c7 100644
--- a/src/pack.h
+++ b/src/pack.h
@@ -1,4 +1,4 @@
-/* ----------------------------------------------------------------------
+/* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    http://lammps.sandia.gov, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
@@ -22,13 +22,746 @@ struct pack_plan_3d {
   int nqty;                  // # of values/element 
 };
 
-// function prototypes 
 
-void pack_3d(double *, double *, struct pack_plan_3d *);
-void unpack_3d(double *, double *, struct pack_plan_3d *);
-void unpack_3d_permute1_1(double *, double *, struct pack_plan_3d *);
-void unpack_3d_permute1_2(double *, double *, struct pack_plan_3d *);
-void unpack_3d_permute1_n(double *, double *, struct pack_plan_3d *);
-void unpack_3d_permute2_1(double *, double *, struct pack_plan_3d *);
-void unpack_3d_permute2_2(double *, double *, struct pack_plan_3d *);
-void unpack_3d_permute2_n(double *, double *, struct pack_plan_3d *);
+#if !defined(PACK_POINTER) && !defined(PACK_MEMCPY)
+#define PACK_ARRAY
+#endif
+
+#ifndef PACK_DATA
+#define PACK_DATA double
+#endif
+
+/* ----------------------------------------------------------------------
+   Pack and unpack functions:
+
+   pack routines copy strided values from data into contiguous locs in buf
+   unpack routines copy contiguous values from buf into strided locs in data
+   different versions of unpack depending on permutation
+     and # of values/element
+   PACK_ARRAY routines work via array indices (default)
+   PACK_POINTER routines work via pointers
+   PACK_MEMCPY routines work via pointers and memcpy function
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   pack/unpack with array indices 
+------------------------------------------------------------------------- */
+
+#ifdef PACK_ARRAY
+
+/* ----------------------------------------------------------------------
+   pack from data -> buf 
+------------------------------------------------------------------------- */
+
+static void pack_3d(PACK_DATA *data, PACK_DATA *buf, struct pack_plan_3d *plan)
+{
+  register int in,out,fast,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  in = 0;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_plane;
+    for (mid = 0; mid < nmid; mid++) {
+      out = plane + mid*nstride_line;
+      for (fast = 0; fast < nfast; fast++)
+	buf[in++] = data[out++];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data 
+------------------------------------------------------------------------- */
+
+static void unpack_3d(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+{
+  register int in,out,fast,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = 0;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_plane;
+    for (mid = 0; mid < nmid; mid++) {
+      in = plane + mid*nstride_line;
+      for (fast = 0; fast < nfast; fast++)
+	data[in++] = buf[out++];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, 1 value/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute1_1(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+{
+  register int in,out,fast,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = 0;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_line;
+    for (mid = 0; mid < nmid; mid++) {
+      in = plane + mid;
+      for (fast = 0; fast < nfast; fast++, in += nstride_plane)
+	data[in] = buf[out++];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, 2 values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute1_2(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+{
+  register int in,out,fast,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = 0;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_line;
+    for (mid = 0; mid < nmid; mid++) {
+      in = plane + 2*mid;
+      for (fast = 0; fast < nfast; fast++, in += nstride_plane) {
+	data[in] = buf[out++];
+	data[in+1] = buf[out++];
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, nqty values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute1_n(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register int in,out,iqty,instart,fast,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane,nqty;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+  nqty = plan->nqty;
+
+  out = 0;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_line;
+    for (mid = 0; mid < nmid; mid++) {
+      instart = plane + nqty*mid;
+      for (fast = 0; fast < nfast; fast++, instart += nstride_plane) {
+	in = instart;
+	for (iqty = 0; iqty < nqty; iqty++) data[in++] = buf[out++];
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, 1 value/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute2_1(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register int in,out,fast,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = 0;
+  for (slow = 0; slow < nslow; slow++) {
+    for (mid = 0; mid < nmid; mid++) {
+      in = slow + mid*nstride_plane;
+      for (fast = 0; fast < nfast; fast++, in += nstride_line)
+	data[in] = buf[out++];
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, 2 values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute2_2(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register int in,out,fast,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = 0;
+  for (slow = 0; slow < nslow; slow++) {
+    for (mid = 0; mid < nmid; mid++) {
+      in = 2*slow + mid*nstride_plane;
+      for (fast = 0; fast < nfast; fast++, in += nstride_line) {
+	data[in] = buf[out++];
+	data[in+1] = buf[out++];
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, nqty values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute2_n(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register int in,out,iqty,instart,fast,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,nqty;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+  nqty = plan->nqty;
+
+  out = 0;
+  for (slow = 0; slow < nslow; slow++) {
+    for (mid = 0; mid < nmid; mid++) {
+      instart = nqty*slow + mid*nstride_plane;
+      for (fast = 0; fast < nfast; fast++, instart += nstride_line) {
+	in = instart;
+	for (iqty = 0; iqty < nqty; iqty++) data[in++] = buf[out++];
+      }
+    }
+  }
+}
+
+#endif
+
+/* ----------------------------------------------------------------------
+   pack/unpack with pointers 
+------------------------------------------------------------------------- */
+
+#ifdef PACK_POINTER
+
+/* ----------------------------------------------------------------------
+   pack from data -> buf 
+------------------------------------------------------------------------- */
+
+static void pack_3d(PACK_DATA *data, PACK_DATA *buf, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*begin,*end;
+  register int mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  in = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_plane;
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[plane+mid*nstride_line]);
+      end = begin + nfast;
+      for (out = begin; out < end; out++)
+	*(in++) = *out;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data 
+------------------------------------------------------------------------- */
+
+static void unpack_3d(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*begin,*end;
+  register int mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_plane;
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[plane+mid*nstride_line]);
+      end = begin + nfast;
+      for (in = begin; in < end; in++)
+	*in = *(out++);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, 1 value/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute1_1(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*begin,*end;
+  register int mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_line;
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[plane+mid]);
+      end = begin + nfast*nstride_plane;
+      for (in = begin; in < end; in += nstride_plane)
+	*in = *(out++);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, 2 values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute1_2(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*begin,*end;
+  register int mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_line;
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[plane+2*mid]);
+      end = begin + nfast*nstride_plane;
+      for (in = begin; in < end; in += nstride_plane) {
+	*in = *(out++);
+	*(in+1) = *(out++);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, nqty values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute1_n(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*instart,*begin,*end;
+  register int iqty,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane,nqty;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+  nqty = plan->nqty;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_line;
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[plane+nqty*mid]);
+      end = begin + nfast*nstride_plane;
+      for (instart = begin; instart < end; instart += nstride_plane) {
+	in = instart;
+	for (iqty = 0; iqty < nqty; iqty++) *(in++) = *(out++);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, 1 value/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute2_1(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*begin,*end;
+  register int mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[slow+mid*nstride_plane]);
+      end = begin + nfast*nstride_line;
+      for (in = begin; in < end; in += nstride_line)
+	*in = *(out++);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, 2 values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute2_2(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*begin,*end;
+  register int mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[2*slow+mid*nstride_plane]);
+      end = begin + nfast*nstride_line;
+      for (in = begin; in < end; in += nstride_line) {
+	*in = *(out++);
+	*(in+1) = *(out++);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, nqty values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute2_n(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*instart,*begin,*end;
+  register int iqty,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,nqty;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+  nqty = plan->nqty;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[nqty*slow+mid*nstride_plane]);
+      end = begin + nfast*nstride_line;
+      for (instart = begin; instart < end; instart += nstride_line) {
+	in = instart;
+	for (iqty = 0; iqty < nqty; iqty++) *(in++) = *(out++);
+      }
+    }
+  }
+}
+
+#endif
+
+/* ----------------------------------------------------------------------
+   pack/unpack with pointers and memcpy function 
+   no memcpy version of unpack_permute routines,
+     just use PACK_POINTER versions 
+------------------------------------------------------------------------- */
+
+#ifdef PACK_MEMCPY
+
+/* ----------------------------------------------------------------------
+   pack from data -> buf 
+------------------------------------------------------------------------- */
+
+static void pack_3d(PACK_DATA *data, PACK_DATA *buf, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out;
+  register int mid,slow,size;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane,upto;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  size = nfast*sizeof(PACK_DATA);
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_plane;
+    upto = slow*nmid*nfast;
+    for (mid = 0; mid < nmid; mid++) {
+      in = &(buf[upto+mid*nfast]);
+      out = &(data[plane+mid*nstride_line]);
+      memcpy(in,out,size);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data 
+------------------------------------------------------------------------- */
+
+static void unpack_3d(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out;
+  register int mid,slow,size;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane,upto;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  size = nfast*sizeof(PACK_DATA);
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_plane;
+    upto = slow*nmid*nfast;
+    for (mid = 0; mid < nmid; mid++) {
+      in = &(data[plane+mid*nstride_line]);
+      out = &(buf[upto+mid*nfast]);
+      memcpy(in,out,size);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, 1 value/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute1_1(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*begin,*end;
+  register int mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_line;
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[plane+mid]);
+      end = begin + nfast*nstride_plane;
+      for (in = begin; in < end; in += nstride_plane)
+	*in = *(out++);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, 2 values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute1_2(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*begin,*end;
+  register int mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_line;
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[plane+2*mid]);
+      end = begin + nfast*nstride_plane;
+      for (in = begin; in < end; in += nstride_plane) {
+	*in = *(out++);
+	*(in+1) = *(out++);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, one axis permutation, nqty values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute1_n(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*instart,*begin,*end;
+  register int iqty,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,plane,nqty;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+  nqty = plan->nqty;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    plane = slow*nstride_line;
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[plane+nqty*mid]);
+      end = begin + nfast*nstride_plane;
+      for (instart = begin; instart < end; instart += nstride_plane) {
+	in = instart;
+	for (iqty = 0; iqty < nqty; iqty++) *(in++) = *(out++);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, 1 value/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute2_1(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*begin,*end;
+  register int mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[slow+mid*nstride_plane]);
+      end = begin + nfast*nstride_line;
+      for (in = begin; in < end; in += nstride_line)
+	*in = *(out++);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, 2 values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute2_2(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*begin,*end;
+  register int mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[2*slow+mid*nstride_plane]);
+      end = begin + nfast*nstride_line;
+      for (in = begin; in < end; in += nstride_line) {
+	*in = *(out++);
+	*(in+1) = *(out++);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   unpack from buf -> data, two axis permutation, nqty values/element 
+------------------------------------------------------------------------- */
+
+static void unpack_3d_permute2_n(PACK_DATA *buf, PACK_DATA *data, struct pack_plan_3d *plan)
+
+{
+  register PACK_DATA *in,*out,*instart,*begin,*end;
+  register int iqty,mid,slow;
+  register int nfast,nmid,nslow,nstride_line,nstride_plane,nqty;
+
+  nfast = plan->nfast;
+  nmid = plan->nmid;
+  nslow = plan->nslow;
+  nstride_line = plan->nstride_line;
+  nstride_plane = plan->nstride_plane;
+  nqty = plan->nqty;
+
+  out = buf;
+  for (slow = 0; slow < nslow; slow++) {
+    for (mid = 0; mid < nmid; mid++) {
+      begin = &(data[nqty*slow+mid*nstride_plane]);
+      end = begin + nfast*nstride_line;
+      for (instart = begin; instart < end; instart += nstride_line) {
+	in = instart;
+	for (iqty = 0; iqty < nqty; iqty++) *(in++) = *(out++);
+      }
+    }
+  }
+}
+
+#endif