git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6760 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2011-08-23 17:07:41 +00:00
parent 1490fa2606
commit 301dad125f
12 changed files with 919 additions and 939 deletions
--- a/src/OPT/pair_eam_alloy_opt.cpp
+++ b/src/OPT/pair_eam_alloy_opt.cpp
@ -11,6 +11,14 @@
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

+/* ----------------------------------------------------------------------
+   Contributing authors:  
+     James Fischer, High Performance Technologies, Inc.
+     Charles Cornwell, High Performance Technologies, Inc.
+     David Richie, Stone Ridge Technology
+     Vincent Natoli, Stone Ridge Technology
+------------------------------------------------------------------------- */
+
 #include "pair_eam_alloy_opt.h"

 using namespace LAMMPS_NS;
@ -18,6 +26,8 @@ using namespace LAMMPS_NS;
 /* ----------------------------------------------------------------------
   multiple inheritance from two parent classes
   invoke constructor of grandparent class, then of each parent
+   inherit optimized compute() from PairEAMOpt
+   inherit everything else from PairEAMAlloy
 ------------------------------------------------------------------------- */

 PairEAMAlloyOpt::PairEAMAlloyOpt(LAMMPS *lmp) :
--- a/src/OPT/pair_eam_alloy_opt.h
+++ b/src/OPT/pair_eam_alloy_opt.h
@ -25,10 +25,6 @@ PairStyle(eam/alloy/opt,PairEAMAlloyOpt)

 namespace LAMMPS_NS {

-// multiple inheritance from two parent classes
-// optimized compute() from PairEAMOpt
-// everything else from PairEAMAlloy
-
 class PairEAMAlloyOpt : public PairEAMAlloy, public PairEAMOpt {
 public:
  PairEAMAlloyOpt(class LAMMPS *);
--- a/src/OPT/pair_eam_fs_opt.cpp
+++ b/src/OPT/pair_eam_fs_opt.cpp
@ -11,6 +11,14 @@
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

+/* ----------------------------------------------------------------------
+   Contributing authors:  
+     James Fischer, High Performance Technologies, Inc.
+     Charles Cornwell, High Performance Technologies, Inc.
+     David Richie, Stone Ridge Technology
+     Vincent Natoli, Stone Ridge Technology
+------------------------------------------------------------------------- */
+
 #include "pair_eam_fs_opt.h"

 using namespace LAMMPS_NS;
@ -18,6 +26,8 @@ using namespace LAMMPS_NS;
 /* ----------------------------------------------------------------------
   multiple inheritance from two parent classes
   invoke constructor of grandparent class, then of each parent
+   inherit optimized compute() from PairEAMOpt
+   inherit everything else from PairEAMFS
 ------------------------------------------------------------------------- */

 PairEAMFSOpt::PairEAMFSOpt(LAMMPS *lmp) :
--- a/src/OPT/pair_eam_fs_opt.h
+++ b/src/OPT/pair_eam_fs_opt.h
@ -25,10 +25,6 @@ PairStyle(eam/fs/opt,PairEAMFSOpt)

 namespace LAMMPS_NS {

-// multiple inheritance from two parent classes
-// optimized compute() from PairEAMOpt
-// everything else from PairEAMFS
-
 class PairEAMFSOpt : public PairEAMFS, public PairEAMOpt {
 public:
  PairEAMFSOpt(class LAMMPS *);
@ -37,5 +33,4 @@ class PairEAMFSOpt : public PairEAMFS, public PairEAMOpt {
 }

 #endif
-
 #endif
--- a/src/OPT/pair_eam_opt.cpp
+++ b/src/OPT/pair_eam_opt.cpp
@ -19,10 +19,20 @@
     Vincent Natoli, Stone Ridge Technology
 ------------------------------------------------------------------------- */

+#include "math.h"
+#include "stdlib.h"
 #include "pair_eam_opt.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neigh_list.h"
+#include "memory.h"

 using namespace LAMMPS_NS;

+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
 /* ---------------------------------------------------------------------- */

 PairEAMOpt::PairEAMOpt(LAMMPS *lmp) : PairEAM(lmp) {}
@ -47,3 +57,295 @@ void PairEAMOpt::compute(int eflag, int vflag)
    else return eval<0,0,0>();
  }
 }
+
+/* ---------------------------------------------------------------------- */
+
+template < int EVFLAG, int EFLAG, int NEWTON_PAIR >
+void PairEAMOpt::eval()
+{
+  typedef struct { double x,y,z; } vec3_t;
+  
+  typedef struct {
+    double rhor0i,rhor1i,rhor2i,rhor3i;
+    double rhor0j,rhor1j,rhor2j,rhor3j;
+  } fast_alpha_t;
+  
+  typedef struct {
+    double frho0,frho1,frho2,frho3,frho4,frho5,frho6;
+    double _pad[1];
+  } fast_beta_t;
+  
+  typedef struct {
+    double rhor4i,rhor5i,rhor6i;
+    double rhor4j,rhor5j,rhor6j;
+    double z2r0,z2r1,z2r2,z2r3,z2r4,z2r5,z2r6;
+    double _pad[3];
+  } fast_gamma_t;
+  
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double evdwl = 0.0;
+  double* __restrict__ coeff;
+
+  // grow energy array if necessary
+  
+  if (atom->nmax > nmax) {
+    memory->sfree(rho);
+    memory->sfree(fp);
+    nmax = atom->nmax;
+    rho = (double *) memory->smalloc(nmax*sizeof(double),"pair:rho");
+    fp = (double *) memory->smalloc(nmax*sizeof(double),"pair:fp");
+  }
+  
+  double** __restrict__ x = atom->x;
+  double** __restrict__ f = atom->f;
+  int* __restrict__ type = atom->type;
+  int nlocal = atom->nlocal;
+  
+  vec3_t* __restrict__ xx = (vec3_t*)x[0];
+  vec3_t* __restrict__ ff = (vec3_t*)f[0];
+  
+  double tmp_cutforcesq = cutforcesq;
+  double tmp_rdr = rdr;
+  int nr2 = nr-2;
+  int nr1 = nr-1;
+  
+  inum = list->inum;
+  int* __restrict__ ilist = list->ilist;
+  int** __restrict__ firstneigh = list->firstneigh;
+  int* __restrict__ numneigh = list->numneigh;
+  
+  int ntypes = atom->ntypes;
+  int ntypes2 = ntypes*ntypes;
+  
+  fast_alpha_t* __restrict__ fast_alpha = 
+    (fast_alpha_t*) malloc(ntypes2*(nr+1)*sizeof(fast_alpha_t));
+  for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
+    fast_alpha_t* __restrict__ tab = &fast_alpha[i*ntypes*nr+j*nr];
+    if (type2rhor[i+1][j+1] >= 0) {
+      for(int m = 1; m <= nr; m++) {
+	tab[m].rhor0i =  rhor_spline[type2rhor[i+1][j+1]][m][6];
+	tab[m].rhor1i =  rhor_spline[type2rhor[i+1][j+1]][m][5];
+	tab[m].rhor2i =  rhor_spline[type2rhor[i+1][j+1]][m][4];
+	tab[m].rhor3i =  rhor_spline[type2rhor[i+1][j+1]][m][3];
+      }
+    }
+    if (type2rhor[j+1][i+1] >= 0) {
+      for(int m = 1; m <= nr; m++) {
+	tab[m].rhor0j =  rhor_spline[type2rhor[j+1][i+1]][m][6];
+	tab[m].rhor1j =  rhor_spline[type2rhor[j+1][i+1]][m][5];
+	tab[m].rhor2j =  rhor_spline[type2rhor[j+1][i+1]][m][4];
+	tab[m].rhor3j =  rhor_spline[type2rhor[j+1][i+1]][m][3];
+      }
+    }
+  }
+  fast_alpha_t* __restrict__ tabeight = fast_alpha;
+  
+  fast_gamma_t* __restrict__ fast_gamma = 
+    (fast_gamma_t*) malloc(ntypes2*(nr+1)*sizeof(fast_gamma_t));
+  for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
+    fast_gamma_t* __restrict__ tab = &fast_gamma[i*ntypes*nr+j*nr];
+    if (type2rhor[i+1][j+1] >= 0) {
+      for(int m = 1; m <= nr; m++) {
+	tab[m].rhor4i =  rhor_spline[type2rhor[i+1][j+1]][m][2];
+	tab[m].rhor5i =  rhor_spline[type2rhor[i+1][j+1]][m][1];
+	tab[m].rhor6i =  rhor_spline[type2rhor[i+1][j+1]][m][0];
+      }
+    }
+    if (type2rhor[j+1][i+1] >= 0) {
+      for(int m = 1; m <= nr; m++) {
+	tab[m].rhor4j =  rhor_spline[type2rhor[j+1][i+1]][m][2];
+	tab[m].rhor5j =  rhor_spline[type2rhor[j+1][i+1]][m][1];
+	tab[m].rhor6j =  rhor_spline[type2rhor[j+1][i+1]][m][0];
+	tab[m].z2r6 =  z2r_spline[type2z2r[i+1][j+1]][m][0];
+      }
+    }
+    if (type2z2r[i+1][j+1] >= 0) {
+      for(int m = 1; m <= nr; m++) {
+	tab[m].z2r0 =  z2r_spline[type2z2r[i+1][j+1]][m][6];
+	tab[m].z2r1 =  z2r_spline[type2z2r[i+1][j+1]][m][5];
+	tab[m].z2r2 =  z2r_spline[type2z2r[i+1][j+1]][m][4];
+	tab[m].z2r3 =  z2r_spline[type2z2r[i+1][j+1]][m][3];
+	tab[m].z2r4 =  z2r_spline[type2z2r[i+1][j+1]][m][2];
+	tab[m].z2r5 =  z2r_spline[type2z2r[i+1][j+1]][m][1];
+	tab[m].z2r6 =  z2r_spline[type2z2r[i+1][j+1]][m][0];
+      }
+    }
+  }
+  fast_gamma_t* __restrict__ tabss = fast_gamma;
+  
+  // zero out density
+  
+  if (NEWTON_PAIR) {
+    int m = nlocal + atom->nghost;
+    for (i = 0; i < m; i++) rho[i] = 0.0;
+  } else for (i = 0; i < nlocal; i++) rho[i] = 0.0;
+  
+  // rho = density at each atom
+  // loop over neighbors of my atoms
+  
+  // loop over neighbors of my atoms
+  
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    double xtmp = xx[i].x;
+    double ytmp = xx[i].y;
+    double ztmp = xx[i].z;
+    itype = type[i] - 1;
+    int* __restrict__ jlist = firstneigh[i];
+    jnum = numneigh[i];
+    
+    double tmprho = rho[i];
+    fast_alpha_t* __restrict__ tabeighti = &tabeight[itype*ntypes*nr];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      double delx = xtmp - xx[j].x;
+      double dely = ytmp - xx[j].y;
+      double delz = ztmp - xx[j].z;
+      double rsq = delx*delx + dely*dely + delz*delz;
+      
+      if (rsq < tmp_cutforcesq) {
+	jtype = type[j] - 1;
+	
+	double p = sqrt(rsq)*tmp_rdr;
+	if ( (int)p <= nr2 ) {
+	  int m = (int)p + 1;
+	  p -= (double)((int)p);
+	  fast_alpha_t& a = tabeighti[jtype*nr+m];
+	  tmprho += ((a.rhor3j*p+a.rhor2j)*p+a.rhor1j)*p+a.rhor0j;
+	  if (NEWTON_PAIR || j < nlocal) {
+	    rho[j] += ((a.rhor3i*p+a.rhor2i)*p+a.rhor1i)*p+a.rhor0i;
+	  }
+	} else {
+	  fast_alpha_t& a = tabeighti[jtype*nr+nr1];
+	  tmprho += a.rhor3j+a.rhor2j+a.rhor1j+a.rhor0j;
+	  if (NEWTON_PAIR || j < nlocal) {
+	    rho[j] += a.rhor3i+a.rhor2i+a.rhor1i+a.rhor0i;
+	  }
+	}
+      }
+    }
+    rho[i] = tmprho;
+  }
+  
+  // communicate and sum densities
+  
+  if (NEWTON_PAIR) comm->reverse_comm_pair(this);
+  
+  // fp = derivative of embedding energy at each atom
+  // phi = embedding energy at each atom
+  
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    double p = rho[i]*rdrho;
+    int m = MIN((int)p,nrho-2);
+    p -= (double)m;
+    ++m;
+    coeff = frho_spline[type2frho[type[i]]][m];
+    fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2];
+    if (EFLAG) {
+      double phi = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
+      if (eflag_global) eng_vdwl += phi;
+      if (eflag_atom) eatom[i] += phi;
+    }
+  }
+  
+  // communicate derivative of embedding function
+  
+  comm->forward_comm_pair(this);
+  
+  // compute forces on each atom
+  // loop over neighbors of my atoms
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    double xtmp = xx[i].x;
+    double ytmp = xx[i].y;
+    double ztmp = xx[i].z;
+    int itype1 = type[i] - 1;
+    int* __restrict__ jlist = firstneigh[i];
+    jnum = numneigh[i];
+    
+    double tmpfx = 0.0;
+    double tmpfy = 0.0;
+    double tmpfz = 0.0;
+    
+    fast_gamma_t* __restrict__ tabssi = &tabss[itype1*ntypes*nr];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+
+      double delx = xtmp - xx[j].x;
+      double dely = ytmp - xx[j].y;
+      double delz = ztmp - xx[j].z;
+      double rsq = delx*delx + dely*dely + delz*delz;
+      
+      if (rsq < tmp_cutforcesq) {
+	jtype = type[j] - 1;
+	double r = sqrt(rsq);
+	double rhoip,rhojp,z2,z2p;
+	double p = r*tmp_rdr;
+	if ( (int)p <= nr2 ) {
+	  int m = (int) p + 1;
+	  p -= (double)((int) p);
+	  
+	  fast_gamma_t& a = tabssi[jtype*nr+m];
+	  rhoip = (a.rhor6i*p + a.rhor5i)*p + a.rhor4i;
+	  rhojp = (a.rhor6j*p + a.rhor5j)*p + a.rhor4j;
+	  z2 = ((a.z2r3*p + a.z2r2)*p + a.z2r1)*p + a.z2r0;
+	  z2p = (a.z2r6*p + a.z2r5)*p + a.z2r4;
+	  
+	} else {
+	  
+	  fast_gamma_t& a = tabssi[jtype*nr+nr1];
+	  rhoip = a.rhor6i + a.rhor5i + a.rhor4i;
+	  rhojp = a.rhor6j + a.rhor5j + a.rhor4j;
+	  z2 = a.z2r3 + a.z2r2 + a.z2r1 + a.z2r0;
+	  z2p = a.z2r6 + a.z2r5 + a.z2r4;
+	}
+	
+	// rhoip = derivative of (density at atom j due to atom i)
+	// rhojp = derivative of (density at atom i due to atom j)
+	// phi = pair potential energy
+	// phip = phi'
+	// z2 = phi * r
+	// z2p = (phi * r)' = (phi' r) + phi
+	// psip needs both fp[i] and fp[j] terms since r_ij appears in two
+	//   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
+	//   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
+	
+	double recip = 1.0/r;
+	double phi = z2*recip;
+	double phip = z2p*recip - phi*recip;
+	double psip = fp[i]*rhojp + fp[j]*rhoip + phip;
+	double fpair = -psip*recip;
+	
+	tmpfx += delx*fpair;
+	tmpfy += dely*fpair;
+	tmpfz += delz*fpair;
+	if (NEWTON_PAIR || j < nlocal) {
+	  ff[j].x -= delx*fpair;
+	  ff[j].y -= dely*fpair;
+	  ff[j].z -= delz*fpair;
+	}
+	
+	if (EFLAG) evdwl = phi;
+	
+	if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
+			     evdwl,0.0,fpair,delx,dely,delz);
+      }
+    }
+
+    ff[i].x += tmpfx;
+    ff[i].y += tmpfy;
+    ff[i].z += tmpfz;
+  }
+  
+  free(fast_alpha); fast_alpha = 0;
+  free(fast_gamma); fast_gamma = 0;
+  
+  if (vflag_fdotr) virial_fdotr_compute();
+}
--- a/src/OPT/pair_eam_opt.h
+++ b/src/OPT/pair_eam_opt.h
@ -11,14 +11,6 @@
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

-/* ----------------------------------------------------------------------
-   Contributing authors:  
-     James Fischer, High Performance Technologies, Inc.
-     Charles Cornwell, High Performance Technologies, Inc.
-     David Richie, Stone Ridge Technology
-     Vincent Natol, Stone Ridge Technology
------------------------------------------------------------------------- */
-
 #ifdef PAIR_CLASS

 PairStyle(eam/opt,PairEAMOpt)
@ -28,17 +20,7 @@ PairStyle(eam/opt,PairEAMOpt)
 #ifndef LMP_PAIR_EAM_OPT_H
 #define LMP_PAIR_EAM_OPT_H

-#include "math.h"
-#include "stdlib.h"
 #include "pair_eam.h"
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "neigh_list.h"
-#include "memory.h"
-
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
-#define MAX(a,b) ((a) > (b) ? (a) : (b))

 namespace LAMMPS_NS {

@ -54,296 +36,6 @@ class PairEAMOpt : virtual public PairEAM {
  template < int EVFLAG, int EFLAG, int NEWTON_PAIR > void eval();
 };

-template < int EVFLAG, int EFLAG, int NEWTON_PAIR >
-void PairEAMOpt::eval()
-{
-  typedef struct { double x,y,z; } vec3_t;
-  
-  typedef struct {
-    double rhor0i,rhor1i,rhor2i,rhor3i;
-    double rhor0j,rhor1j,rhor2j,rhor3j;
-  } fast_alpha_t;
-  
-  typedef struct {
-    double frho0,frho1,frho2,frho3,frho4,frho5,frho6;
-    double _pad[1];
-  } fast_beta_t;
-  
-  typedef struct {
-    double rhor4i,rhor5i,rhor6i;
-    double rhor4j,rhor5j,rhor6j;
-    double z2r0,z2r1,z2r2,z2r3,z2r4,z2r5,z2r6;
-    double _pad[3];
-  } fast_gamma_t;
-  
-  int i,j,ii,jj,inum,jnum,itype,jtype;
-  double evdwl = 0.0;
-  double* __restrict__ coeff;
-
-  // grow energy array if necessary
-  
-  if (atom->nmax > nmax) {
-    memory->sfree(rho);
-    memory->sfree(fp);
-    nmax = atom->nmax;
-    rho = (double *) memory->smalloc(nmax*sizeof(double),"pair:rho");
-    fp = (double *) memory->smalloc(nmax*sizeof(double),"pair:fp");
-  }
-  
-  double** __restrict__ x = atom->x;
-  double** __restrict__ f = atom->f;
-  int* __restrict__ type = atom->type;
-  int nlocal = atom->nlocal;
-  
-  vec3_t* __restrict__ xx = (vec3_t*)x[0];
-  vec3_t* __restrict__ ff = (vec3_t*)f[0];
-  
-  double tmp_cutforcesq = cutforcesq;
-  double tmp_rdr = rdr;
-  int nr2 = nr-2;
-  int nr1 = nr-1;
-  
-  inum = list->inum;
-  int* __restrict__ ilist = list->ilist;
-  int** __restrict__ firstneigh = list->firstneigh;
-  int* __restrict__ numneigh = list->numneigh;
-  
-  int ntypes = atom->ntypes;
-  int ntypes2 = ntypes*ntypes;
-  
-  fast_alpha_t* __restrict__ fast_alpha = 
-    (fast_alpha_t*) malloc(ntypes2*(nr+1)*sizeof(fast_alpha_t));
-  for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
-    fast_alpha_t* __restrict__ tab = &fast_alpha[i*ntypes*nr+j*nr];
-    if (type2rhor[i+1][j+1] >= 0) {
-      for(int m = 1; m <= nr; m++) {
-	tab[m].rhor0i =  rhor_spline[type2rhor[i+1][j+1]][m][6];
-	tab[m].rhor1i =  rhor_spline[type2rhor[i+1][j+1]][m][5];
-	tab[m].rhor2i =  rhor_spline[type2rhor[i+1][j+1]][m][4];
-	tab[m].rhor3i =  rhor_spline[type2rhor[i+1][j+1]][m][3];
-      }
-    }
-    if (type2rhor[j+1][i+1] >= 0) {
-      for(int m = 1; m <= nr; m++) {
-	tab[m].rhor0j =  rhor_spline[type2rhor[j+1][i+1]][m][6];
-	tab[m].rhor1j =  rhor_spline[type2rhor[j+1][i+1]][m][5];
-	tab[m].rhor2j =  rhor_spline[type2rhor[j+1][i+1]][m][4];
-	tab[m].rhor3j =  rhor_spline[type2rhor[j+1][i+1]][m][3];
-      }
-    }
-  }
-  fast_alpha_t* __restrict__ tabeight = fast_alpha;
-  
-  fast_gamma_t* __restrict__ fast_gamma = 
-    (fast_gamma_t*) malloc(ntypes2*(nr+1)*sizeof(fast_gamma_t));
-  for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
-    fast_gamma_t* __restrict__ tab = &fast_gamma[i*ntypes*nr+j*nr];
-    if (type2rhor[i+1][j+1] >= 0) {
-      for(int m = 1; m <= nr; m++) {
-	tab[m].rhor4i =  rhor_spline[type2rhor[i+1][j+1]][m][2];
-	tab[m].rhor5i =  rhor_spline[type2rhor[i+1][j+1]][m][1];
-	tab[m].rhor6i =  rhor_spline[type2rhor[i+1][j+1]][m][0];
-      }
-    }
-    if (type2rhor[j+1][i+1] >= 0) {
-      for(int m = 1; m <= nr; m++) {
-	tab[m].rhor4j =  rhor_spline[type2rhor[j+1][i+1]][m][2];
-	tab[m].rhor5j =  rhor_spline[type2rhor[j+1][i+1]][m][1];
-	tab[m].rhor6j =  rhor_spline[type2rhor[j+1][i+1]][m][0];
-	tab[m].z2r6 =  z2r_spline[type2z2r[i+1][j+1]][m][0];
-      }
-    }
-    if (type2z2r[i+1][j+1] >= 0) {
-      for(int m = 1; m <= nr; m++) {
-	tab[m].z2r0 =  z2r_spline[type2z2r[i+1][j+1]][m][6];
-	tab[m].z2r1 =  z2r_spline[type2z2r[i+1][j+1]][m][5];
-	tab[m].z2r2 =  z2r_spline[type2z2r[i+1][j+1]][m][4];
-	tab[m].z2r3 =  z2r_spline[type2z2r[i+1][j+1]][m][3];
-	tab[m].z2r4 =  z2r_spline[type2z2r[i+1][j+1]][m][2];
-	tab[m].z2r5 =  z2r_spline[type2z2r[i+1][j+1]][m][1];
-	tab[m].z2r6 =  z2r_spline[type2z2r[i+1][j+1]][m][0];
-      }
-    }
-  }
-  fast_gamma_t* __restrict__ tabss = fast_gamma;
-  
-  // zero out density
-  
-  if (NEWTON_PAIR) {
-    int m = nlocal + atom->nghost;
-    for (i = 0; i < m; i++) rho[i] = 0.0;
-  } else for (i = 0; i < nlocal; i++) rho[i] = 0.0;
-  
-  // rho = density at each atom
-  // loop over neighbors of my atoms
-  
-  // loop over neighbors of my atoms
-  
-  for (ii = 0; ii < inum; ii++) {
-    i = ilist[ii];
-    double xtmp = xx[i].x;
-    double ytmp = xx[i].y;
-    double ztmp = xx[i].z;
-    itype = type[i] - 1;
-    int* __restrict__ jlist = firstneigh[i];
-    jnum = numneigh[i];
-    
-    double tmprho = rho[i];
-    fast_alpha_t* __restrict__ tabeighti = &tabeight[itype*ntypes*nr];
-
-    for (jj = 0; jj < jnum; jj++) {
-      j = jlist[jj];
-      j &= NEIGHMASK;
-
-      double delx = xtmp - xx[j].x;
-      double dely = ytmp - xx[j].y;
-      double delz = ztmp - xx[j].z;
-      double rsq = delx*delx + dely*dely + delz*delz;
-      
-      if (rsq < tmp_cutforcesq) {
-	jtype = type[j] - 1;
-	
-	double p = sqrt(rsq)*tmp_rdr;
-	if ( (int)p <= nr2 ) {
-	  int m = (int)p + 1;
-	  p -= (double)((int)p);
-	  fast_alpha_t& a = tabeighti[jtype*nr+m];
-	  tmprho += ((a.rhor3j*p+a.rhor2j)*p+a.rhor1j)*p+a.rhor0j;
-	  if (NEWTON_PAIR || j < nlocal) {
-	    rho[j] += ((a.rhor3i*p+a.rhor2i)*p+a.rhor1i)*p+a.rhor0i;
-	  }
-	} else {
-	  fast_alpha_t& a = tabeighti[jtype*nr+nr1];
-	  tmprho += a.rhor3j+a.rhor2j+a.rhor1j+a.rhor0j;
-	  if (NEWTON_PAIR || j < nlocal) {
-	    rho[j] += a.rhor3i+a.rhor2i+a.rhor1i+a.rhor0i;
-	  }
-	}
-      }
-    }
-    rho[i] = tmprho;
-  }
-  
-  // communicate and sum densities
-  
-  if (NEWTON_PAIR) comm->reverse_comm_pair(this);
-  
-  // fp = derivative of embedding energy at each atom
-  // phi = embedding energy at each atom
-  
-  for (ii = 0; ii < inum; ii++) {
-    i = ilist[ii];
-    double p = rho[i]*rdrho;
-    int m = MIN((int)p,nrho-2);
-    p -= (double)m;
-    ++m;
-    coeff = frho_spline[type2frho[type[i]]][m];
-    fp[i] = (coeff[0]*p + coeff[1])*p + coeff[2];
-    if (EFLAG) {
-      double phi = ((coeff[3]*p + coeff[4])*p + coeff[5])*p + coeff[6];
-      if (eflag_global) eng_vdwl += phi;
-      if (eflag_atom) eatom[i] += phi;
-    }
-  }
-  
-  // communicate derivative of embedding function
-  
-  comm->forward_comm_pair(this);
-  
-  // compute forces on each atom
-  // loop over neighbors of my atoms
-
-  for (ii = 0; ii < inum; ii++) {
-    i = ilist[ii];
-    double xtmp = xx[i].x;
-    double ytmp = xx[i].y;
-    double ztmp = xx[i].z;
-    int itype1 = type[i] - 1;
-    int* __restrict__ jlist = firstneigh[i];
-    jnum = numneigh[i];
-    
-    double tmpfx = 0.0;
-    double tmpfy = 0.0;
-    double tmpfz = 0.0;
-    
-    fast_gamma_t* __restrict__ tabssi = &tabss[itype1*ntypes*nr];
-
-    for (jj = 0; jj < jnum; jj++) {
-      j = jlist[jj];
-      j &= NEIGHMASK;
-
-      double delx = xtmp - xx[j].x;
-      double dely = ytmp - xx[j].y;
-      double delz = ztmp - xx[j].z;
-      double rsq = delx*delx + dely*dely + delz*delz;
-      
-      if (rsq < tmp_cutforcesq) {
-	jtype = type[j] - 1;
-	double r = sqrt(rsq);
-	double rhoip,rhojp,z2,z2p;
-	double p = r*tmp_rdr;
-	if ( (int)p <= nr2 ) {
-	  int m = (int) p + 1;
-	  p -= (double)((int) p);
-	  
-	  fast_gamma_t& a = tabssi[jtype*nr+m];
-	  rhoip = (a.rhor6i*p + a.rhor5i)*p + a.rhor4i;
-	  rhojp = (a.rhor6j*p + a.rhor5j)*p + a.rhor4j;
-	  z2 = ((a.z2r3*p + a.z2r2)*p + a.z2r1)*p + a.z2r0;
-	  z2p = (a.z2r6*p + a.z2r5)*p + a.z2r4;
-	  
-	} else {
-	  
-	  fast_gamma_t& a = tabssi[jtype*nr+nr1];
-	  rhoip = a.rhor6i + a.rhor5i + a.rhor4i;
-	  rhojp = a.rhor6j + a.rhor5j + a.rhor4j;
-	  z2 = a.z2r3 + a.z2r2 + a.z2r1 + a.z2r0;
-	  z2p = a.z2r6 + a.z2r5 + a.z2r4;
-	}
-	
-	// rhoip = derivative of (density at atom j due to atom i)
-	// rhojp = derivative of (density at atom i due to atom j)
-	// phi = pair potential energy
-	// phip = phi'
-	// z2 = phi * r
-	// z2p = (phi * r)' = (phi' r) + phi
-	// psip needs both fp[i] and fp[j] terms since r_ij appears in two
-	//   terms of embed eng: Fi(sum rho_ij) and Fj(sum rho_ji)
-	//   hence embed' = Fi(sum rho_ij) rhojp + Fj(sum rho_ji) rhoip
-	
-	double recip = 1.0/r;
-	double phi = z2*recip;
-	double phip = z2p*recip - phi*recip;
-	double psip = fp[i]*rhojp + fp[j]*rhoip + phip;
-	double fpair = -psip*recip;
-	
-	tmpfx += delx*fpair;
-	tmpfy += dely*fpair;
-	tmpfz += delz*fpair;
-	if (NEWTON_PAIR || j < nlocal) {
-	  ff[j].x -= delx*fpair;
-	  ff[j].y -= dely*fpair;
-	  ff[j].z -= delz*fpair;
-	}
-	
-	if (EFLAG) evdwl = phi;
-	
-	if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
-			     evdwl,0.0,fpair,delx,dely,delz);
-      }
-    }
-
-    ff[i].x += tmpfx;
-    ff[i].y += tmpfy;
-    ff[i].z += tmpfz;
-  }
-  
-  free(fast_alpha); fast_alpha = 0;
-  free(fast_gamma); fast_gamma = 0;
-  
-  if (vflag_fdotr) virial_fdotr_compute();
-}
-
 }

 #endif
--- a/src/OPT/pair_lj_charmm_coul_long_opt.cpp
+++ b/src/OPT/pair_lj_charmm_coul_long_opt.cpp
@ -18,10 +18,26 @@
     Vincent Natoli, Stone Ridge Technology
 ------------------------------------------------------------------------- */

+#include "math.h"
+#include "stdlib.h"
 #include "pair_lj_charmm_coul_long_opt.h"
+#include "atom.h"
+#include "force.h"
+#include "neigh_list.h"

 using namespace LAMMPS_NS;

+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#define MAX(a,b) ((a) > (b) ? (a) : (b))
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define EWALD_A1  0.254829592
+#define EWALD_A2 -0.284496736
+#define EWALD_A3  1.421413741
+#define EWALD_A4 -1.453152027
+#define EWALD_A5  1.061405429
+
 /* ---------------------------------------------------------------------- */

 PairLJCharmmCoulLongOpt::PairLJCharmmCoulLongOpt(LAMMPS *lmp) :
@ -47,3 +63,281 @@ void PairLJCharmmCoulLongOpt::compute(int eflag, int vflag)
    else return eval<0,0,0>();
  }
 }
+
+/* ---------------------------------------------------------------------- */
+
+template < int EVFLAG, int EFLAG, int NEWTON_PAIR >
+void PairLJCharmmCoulLongOpt::eval()
+{
+  typedef struct { double x,y,z; } vec3_t;
+  
+  typedef struct {
+    double cutsq,lj1,lj2,lj3,lj4,offset;
+    double _pad[2];
+  } fast_alpha_t;
+  
+  int i,j,ii,jj,inum,jnum,itype,jtype,itable,sbindex;
+  double fraction,table;
+  double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double grij,expm2,prefactor,t,erfc;
+  double philj,switch1,switch2;
+ 
+  double rsq; 
+  
+  double evdwl = 0.0;
+  double ecoul = 0.0;
+  
+  double** __restrict__ x = atom->x;
+  double** __restrict__ f = atom->f;
+  double* __restrict__ q = atom->q;
+  int* __restrict__ type = atom->type;
+  int nlocal = atom->nlocal;
+  double* __restrict__ special_coul = force->special_coul;
+  double* __restrict__ special_lj = force->special_lj;
+  double qqrd2e = force->qqrd2e;
+
+  inum = list->inum;
+  int* __restrict__ ilist = list->ilist;
+  int** __restrict__ firstneigh = list->firstneigh;
+  int* __restrict__ numneigh = list->numneigh;
+  
+  vec3_t* __restrict__ xx = (vec3_t*)x[0];
+  vec3_t* __restrict__ ff = (vec3_t*)f[0];
+  
+  int ntypes = atom->ntypes;
+  int ntypes2 = ntypes*ntypes;
+  
+  double tmp_coef1 = 1.0/denom_lj;
+  double tmp_coef2 = cut_ljsq - 3.0*cut_lj_innersq;
+  
+  fast_alpha_t* __restrict__ fast_alpha = 
+    (fast_alpha_t*)malloc(ntypes2*sizeof(fast_alpha_t));
+  for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
+    fast_alpha_t& a = fast_alpha[i*ntypes+j];
+    a.cutsq = cutsq[i+1][j+1];
+    a.lj1 = lj1[i+1][j+1];
+    a.lj2 = lj2[i+1][j+1];
+    a.lj3 = lj3[i+1][j+1];
+    a.lj4 = lj4[i+1][j+1];
+  }
+  fast_alpha_t* __restrict__ tabsix = fast_alpha;
+  
+  // loop over neighbors of my atoms
+  
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    double qtmp = q[i];
+    double xtmp = xx[i].x;
+    double ytmp = xx[i].y;
+    double ztmp = xx[i].z;
+    itype = type[i] - 1;
+    int* __restrict__ jlist = firstneigh[i];
+    jnum = numneigh[i];
+    
+    double tmpfx = 0.0;
+    double tmpfy = 0.0;
+    double tmpfz = 0.0;
+    
+    fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*) &tabsix[itype*ntypes];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      sbindex = sbmask(j);
+
+      if (sbindex == 0) {
+	double delx = xtmp - xx[j].x;
+	double dely = ytmp - xx[j].y;
+	double delz = ztmp - xx[j].z;
+	rsq = delx*delx + dely*dely + delz*delz;
+	double tmp_coef3 = qtmp*q[j];
+	
+	if (rsq < cut_bothsq) {
+	  r2inv = 1.0/rsq;
+	  
+	  forcecoul = 0.0;
+	  if (rsq < cut_coulsq) {
+	    if (!ncoultablebits || rsq <= tabinnersq) {
+	      r = sqrt(rsq);
+	      grij = g_ewald * r;
+	      expm2 = exp(-grij*grij);
+	      t = 1.0 / (1.0 + EWALD_P*grij);
+	      erfc = t * 
+		(EWALD_A1+t*(EWALD_A2+t*(EWALD_A3+t*(EWALD_A4+t*EWALD_A5)))) * 
+		expm2;
+	      prefactor = qqrd2e * tmp_coef3/r;
+	      forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+	    } else {
+	      union_int_float_t rsq_lookup;
+	      rsq_lookup.f = rsq;
+	      itable = rsq_lookup.i & ncoulmask;
+	      itable >>= ncoulshiftbits;
+	      fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+	      table = ftable[itable] + fraction*dftable[itable];
+	      forcecoul = tmp_coef3 * table;
+	    }
+	  }  
+	  
+	  forcelj = 0.0;
+	  if (rsq < cut_ljsq) {
+	    r6inv = r2inv*r2inv*r2inv;
+	    jtype = type[j] - 1;
+	    fast_alpha_t& a = tabsixi[jtype];
+	    forcelj = r6inv * (a.lj1*r6inv - a.lj2);
+	    if (rsq > cut_lj_innersq) {
+	      switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
+		(tmp_coef2 + 2.0*rsq) * tmp_coef1;
+	      switch2 = 12.0*rsq * (cut_ljsq-rsq) * 
+		(rsq-cut_lj_innersq) * tmp_coef1;
+	      philj = r6inv * (a.lj3*r6inv - a.lj4);
+	      forcelj = forcelj*switch1 + philj*switch2;
+	    }
+	  } 
+	  
+	  double fpair = (forcecoul + forcelj) * r2inv;
+	  
+	  tmpfx += delx*fpair;
+	  tmpfy += dely*fpair;
+	  tmpfz += delz*fpair;
+	  if (NEWTON_PAIR || j < nlocal) {
+	    ff[j].x -= delx*fpair;
+	    ff[j].y -= dely*fpair;
+	    ff[j].z -= delz*fpair;
+	  }
+	  
+	  if (EFLAG) {
+	    if (rsq < cut_coulsq) {
+	      if (!ncoultablebits || rsq <= tabinnersq)
+		ecoul = prefactor*erfc;
+	      else {
+		table = etable[itable] + fraction*detable[itable];
+		ecoul = tmp_coef3 * table;
+	      }
+	    } else ecoul = 0.0;
+
+	    if (rsq < cut_ljsq) {
+	      fast_alpha_t& a = tabsixi[jtype];
+	      evdwl = r6inv*(a.lj3*r6inv-a.lj4);
+	      if (rsq > cut_lj_innersq) {
+		switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
+		  (tmp_coef2 + 2.0*rsq) * tmp_coef1;
+		evdwl *= switch1;
+	      }
+	    } else evdwl = 0.0;
+	  }
+
+	  if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
+			       evdwl,ecoul,fpair,delx,dely,delz);
+	}
+
+      } else {
+	factor_lj = special_lj[sbindex];
+	factor_coul = special_coul[sbindex];
+	j &= NEIGHMASK;
+
+	double delx = xtmp - xx[j].x;
+	double dely = ytmp - xx[j].y;
+	double delz = ztmp - xx[j].z;
+	rsq = delx*delx + dely*dely + delz*delz;
+	double tmp_coef3 = qtmp*q[j];
+	
+	if (rsq < cut_bothsq) {
+	  r2inv = 1.0/rsq;
+	  
+	  forcecoul = 0.0;
+	  if (rsq < cut_coulsq) {
+	    if (!ncoultablebits || rsq <= tabinnersq) {
+	      r = sqrt(rsq);
+	      grij = g_ewald * r;
+	      expm2 = exp(-grij*grij);
+	      t = 1.0 / (1.0 + EWALD_P*grij);
+	      erfc = t * 
+		(EWALD_A1+t*(EWALD_A2+t*(EWALD_A3+t*(EWALD_A4+t*EWALD_A5)))) * 
+		expm2;
+	      prefactor = qqrd2e * tmp_coef3/r;
+	      forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
+	      if (factor_coul < 1.0) {
+		forcecoul -= (1.0-factor_coul)*prefactor;
+	      }
+	    } else {
+	      union_int_float_t rsq_lookup;
+	      rsq_lookup.f = rsq;
+	      itable = rsq_lookup.i & ncoulmask;
+	      itable >>= ncoulshiftbits;
+	      fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+	      table = ftable[itable] + fraction*dftable[itable];
+	      forcecoul = tmp_coef3 * table;
+	      if (factor_coul < 1.0) {
+		table = ctable[itable] + fraction*dctable[itable];
+		prefactor = tmp_coef3 * table;
+		forcecoul -= (1.0-factor_coul)*prefactor;
+	      }
+	    }
+	  }  
+	  
+	  forcelj = 0.0;
+	  if (rsq < cut_ljsq) {
+	    r6inv = r2inv*r2inv*r2inv;
+	    jtype = type[j] - 1;
+	    fast_alpha_t& a = tabsixi[jtype];
+	    forcelj = r6inv * (a.lj1*r6inv - a.lj2);
+	    if (rsq > cut_lj_innersq) {
+	      switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
+		(tmp_coef2 + 2.0*rsq) * tmp_coef1;
+	      switch2 = 12.0*rsq * (cut_ljsq-rsq) * 
+		(rsq-cut_lj_innersq) * tmp_coef1;
+	      fast_alpha_t& a = tabsixi[jtype];
+	      philj = r6inv * (a.lj3*r6inv - a.lj4);
+	      forcelj = forcelj*switch1 + philj*switch2;
+	    }
+	  }  
+	  
+	  double fpair = (forcecoul + factor_lj*forcelj) * r2inv;
+	  
+	  tmpfx += delx*fpair;
+	  tmpfy += dely*fpair;
+	  tmpfz += delz*fpair;
+	  if (NEWTON_PAIR || j < nlocal) {
+	    ff[j].x -= delx*fpair;
+	    ff[j].y -= dely*fpair;
+	    ff[j].z -= delz*fpair;
+	  }
+	  
+	  if (EFLAG) {
+	    if (rsq < cut_coulsq) {
+	      if (!ncoultablebits || rsq <= tabinnersq)
+		ecoul = prefactor*erfc;
+	      else {
+		table = etable[itable] + fraction*detable[itable];
+		ecoul = tmp_coef3 * table;
+	      }
+	      if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
+	    } else ecoul = 0.0;
+
+	    if (rsq < cut_ljsq) {
+	      fast_alpha_t& a = tabsixi[jtype];
+	      evdwl = r6inv*(a.lj3*r6inv-a.lj4);
+	      if (rsq > cut_lj_innersq) {
+		switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
+		  (tmp_coef2 + 2.0*rsq) * tmp_coef1;
+		evdwl *= switch1;
+	      }
+	      evdwl *= factor_lj;
+	    } else evdwl = 0.0;
+	  }
+	  
+	  if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
+			       evdwl,ecoul,fpair,delx,dely,delz);
+	}
+      }
+    }
+
+    ff[i].x += tmpfx;
+    ff[i].y += tmpfy;
+    ff[i].z += tmpfz;
+  }
+
+  free(fast_alpha); fast_alpha = 0;
+
+  if (vflag_fdotr) virial_fdotr_compute();
+}
+
--- a/src/OPT/pair_lj_charmm_coul_long_opt.h
+++ b/src/OPT/pair_lj_charmm_coul_long_opt.h
@ -11,13 +11,6 @@
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

-/* ----------------------------------------------------------------------
-   Contributing authors:  
-     James Fischer, High Performance Technologies, Inc.
-     David Richie, Stone Ridge Technology
-     Vincent Natol, Stone Ridge Technology
------------------------------------------------------------------------- */
-
 #ifdef PAIR_CLASS

 PairStyle(lj/charmm/coul/long/opt,PairLJCharmmCoulLongOpt)
@ -27,26 +20,10 @@ PairStyle(lj/charmm/coul/long/opt,PairLJCharmmCoulLongOpt)
 #ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_OPT_H
 #define LMP_PAIR_LJ_CHARMM_COUL_LONG_OPT_H

-#include "math.h"
-#include "stdlib.h"
 #include "pair_lj_charmm_coul_long.h"
-#include "atom.h"
-#include "force.h"
-#include "neigh_list.h"

 namespace LAMMPS_NS {

-#define EWALD_F   1.12837917
-#define EWALD_P   0.3275911
-#define EWALD_A1  0.254829592
-#define EWALD_A2 -0.284496736
-#define EWALD_A3  1.421413741
-#define EWALD_A4 -1.453152027
-#define EWALD_A5  1.061405429
-
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
-#define MAX(a,b) ((a) > (b) ? (a) : (b))
-
 class PairLJCharmmCoulLongOpt : public PairLJCharmmCoulLong {
 public:
  PairLJCharmmCoulLongOpt(class LAMMPS *);
@ -55,292 +32,6 @@ class PairLJCharmmCoulLongOpt : public PairLJCharmmCoulLong {
 private:
  template < int EVFLAG, int EFLAG, int NEWTON_PAIR > void eval();
 };
-
-template < int EVFLAG, int EFLAG, int NEWTON_PAIR >
-void PairLJCharmmCoulLongOpt::eval()
-{
-  typedef struct { double x,y,z; } vec3_t;
-  
-  typedef struct {
-    double cutsq,lj1,lj2,lj3,lj4,offset;
-    double _pad[2];
-  } fast_alpha_t;
-  
-  int i,j,ii,jj,inum,jnum,itype,jtype,itable,sbindex;
-  double fraction,table;
-  double r,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
-  double grij,expm2,prefactor,t,erfc;
-  double philj,switch1,switch2;
- 
-  double rsq; 
-  
-  double evdwl = 0.0;
-  double ecoul = 0.0;
-  
-  double** __restrict__ x = atom->x;
-  double** __restrict__ f = atom->f;
-  double* __restrict__ q = atom->q;
-  int* __restrict__ type = atom->type;
-  int nlocal = atom->nlocal;
-  double* __restrict__ special_coul = force->special_coul;
-  double* __restrict__ special_lj = force->special_lj;
-  double qqrd2e = force->qqrd2e;
-
-  inum = list->inum;
-  int* __restrict__ ilist = list->ilist;
-  int** __restrict__ firstneigh = list->firstneigh;
-  int* __restrict__ numneigh = list->numneigh;
-  
-  vec3_t* __restrict__ xx = (vec3_t*)x[0];
-  vec3_t* __restrict__ ff = (vec3_t*)f[0];
-  
-  int ntypes = atom->ntypes;
-  int ntypes2 = ntypes*ntypes;
-  
-  double tmp_coef1 = 1.0/denom_lj;
-  double tmp_coef2 = cut_ljsq - 3.0*cut_lj_innersq;
-  
-  fast_alpha_t* __restrict__ fast_alpha = 
-    (fast_alpha_t*)malloc(ntypes2*sizeof(fast_alpha_t));
-  for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
-    fast_alpha_t& a = fast_alpha[i*ntypes+j];
-    a.cutsq = cutsq[i+1][j+1];
-    a.lj1 = lj1[i+1][j+1];
-    a.lj2 = lj2[i+1][j+1];
-    a.lj3 = lj3[i+1][j+1];
-    a.lj4 = lj4[i+1][j+1];
-  }
-  fast_alpha_t* __restrict__ tabsix = fast_alpha;
-  
-  // loop over neighbors of my atoms
-  
-  for (ii = 0; ii < inum; ii++) {
-    i = ilist[ii];
-    double qtmp = q[i];
-    double xtmp = xx[i].x;
-    double ytmp = xx[i].y;
-    double ztmp = xx[i].z;
-    itype = type[i] - 1;
-    int* __restrict__ jlist = firstneigh[i];
-    jnum = numneigh[i];
-    
-    double tmpfx = 0.0;
-    double tmpfy = 0.0;
-    double tmpfz = 0.0;
-    
-    fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*) &tabsix[itype*ntypes];
-
-    for (jj = 0; jj < jnum; jj++) {
-      j = jlist[jj];
-      sbindex = sbmask(j);
-
-      if (sbindex == 0) {
-	double delx = xtmp - xx[j].x;
-	double dely = ytmp - xx[j].y;
-	double delz = ztmp - xx[j].z;
-	rsq = delx*delx + dely*dely + delz*delz;
-	double tmp_coef3 = qtmp*q[j];
-	
-	if (rsq < cut_bothsq) {
-	  r2inv = 1.0/rsq;
-	  
-	  forcecoul = 0.0;
-	  if (rsq < cut_coulsq) {
-	    if (!ncoultablebits || rsq <= tabinnersq) {
-	      r = sqrt(rsq);
-	      grij = g_ewald * r;
-	      expm2 = exp(-grij*grij);
-	      t = 1.0 / (1.0 + EWALD_P*grij);
-	      erfc = t * 
-		(EWALD_A1+t*(EWALD_A2+t*(EWALD_A3+t*(EWALD_A4+t*EWALD_A5)))) * 
-		expm2;
-	      prefactor = qqrd2e * tmp_coef3/r;
-	      forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
-	    } else {
-	      union_int_float_t rsq_lookup;
-	      rsq_lookup.f = rsq;
-	      itable = rsq_lookup.i & ncoulmask;
-	      itable >>= ncoulshiftbits;
-	      fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
-	      table = ftable[itable] + fraction*dftable[itable];
-	      forcecoul = tmp_coef3 * table;
-	    }
-	  }  
-	  
-	  forcelj = 0.0;
-	  if (rsq < cut_ljsq) {
-	    r6inv = r2inv*r2inv*r2inv;
-	    jtype = type[j] - 1;
-	    fast_alpha_t& a = tabsixi[jtype];
-	    forcelj = r6inv * (a.lj1*r6inv - a.lj2);
-	    if (rsq > cut_lj_innersq) {
-	      switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
-		(tmp_coef2 + 2.0*rsq) * tmp_coef1;
-	      switch2 = 12.0*rsq * (cut_ljsq-rsq) * 
-		(rsq-cut_lj_innersq) * tmp_coef1;
-	      philj = r6inv * (a.lj3*r6inv - a.lj4);
-	      forcelj = forcelj*switch1 + philj*switch2;
-	    }
-	  } 
-	  
-	  double fpair = (forcecoul + forcelj) * r2inv;
-	  
-	  tmpfx += delx*fpair;
-	  tmpfy += dely*fpair;
-	  tmpfz += delz*fpair;
-	  if (NEWTON_PAIR || j < nlocal) {
-	    ff[j].x -= delx*fpair;
-	    ff[j].y -= dely*fpair;
-	    ff[j].z -= delz*fpair;
-	  }
-	  
-	  if (EFLAG) {
-	    if (rsq < cut_coulsq) {
-	      if (!ncoultablebits || rsq <= tabinnersq)
-		ecoul = prefactor*erfc;
-	      else {
-		table = etable[itable] + fraction*detable[itable];
-		ecoul = tmp_coef3 * table;
-	      }
-	    } else ecoul = 0.0;
-
-	    if (rsq < cut_ljsq) {
-	      fast_alpha_t& a = tabsixi[jtype];
-	      evdwl = r6inv*(a.lj3*r6inv-a.lj4);
-	      if (rsq > cut_lj_innersq) {
-		switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
-		  (tmp_coef2 + 2.0*rsq) * tmp_coef1;
-		evdwl *= switch1;
-	      }
-	    } else evdwl = 0.0;
-	  }
-
-	  if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
-			       evdwl,ecoul,fpair,delx,dely,delz);
-	}
-
-      } else {
-	factor_lj = special_lj[sbindex];
-	factor_coul = special_coul[sbindex];
-	j &= NEIGHMASK;
-
-	double delx = xtmp - xx[j].x;
-	double dely = ytmp - xx[j].y;
-	double delz = ztmp - xx[j].z;
-	rsq = delx*delx + dely*dely + delz*delz;
-	double tmp_coef3 = qtmp*q[j];
-	
-	if (rsq < cut_bothsq) {
-	  r2inv = 1.0/rsq;
-	  
-	  forcecoul = 0.0;
-	  if (rsq < cut_coulsq) {
-	    if (!ncoultablebits || rsq <= tabinnersq) {
-	      r = sqrt(rsq);
-	      grij = g_ewald * r;
-	      expm2 = exp(-grij*grij);
-	      t = 1.0 / (1.0 + EWALD_P*grij);
-	      erfc = t * 
-		(EWALD_A1+t*(EWALD_A2+t*(EWALD_A3+t*(EWALD_A4+t*EWALD_A5)))) * 
-		expm2;
-	      prefactor = qqrd2e * tmp_coef3/r;
-	      forcecoul = prefactor * (erfc + EWALD_F*grij*expm2);
-	      if (factor_coul < 1.0) {
-		forcecoul -= (1.0-factor_coul)*prefactor;
-	      }
-	    } else {
-	      union_int_float_t rsq_lookup;
-	      rsq_lookup.f = rsq;
-	      itable = rsq_lookup.i & ncoulmask;
-	      itable >>= ncoulshiftbits;
-	      fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
-	      table = ftable[itable] + fraction*dftable[itable];
-	      forcecoul = tmp_coef3 * table;
-	      if (factor_coul < 1.0) {
-		table = ctable[itable] + fraction*dctable[itable];
-		prefactor = tmp_coef3 * table;
-		forcecoul -= (1.0-factor_coul)*prefactor;
-	      }
-	    }
-	  }  
-	  
-	  forcelj = 0.0;
-	  if (rsq < cut_ljsq) {
-	    r6inv = r2inv*r2inv*r2inv;
-	    jtype = type[j] - 1;
-	    fast_alpha_t& a = tabsixi[jtype];
-	    forcelj = r6inv * (a.lj1*r6inv - a.lj2);
-	    if (rsq > cut_lj_innersq) {
-	      switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
-		(tmp_coef2 + 2.0*rsq) * tmp_coef1;
-	      switch2 = 12.0*rsq * (cut_ljsq-rsq) * 
-		(rsq-cut_lj_innersq) * tmp_coef1;
-	      fast_alpha_t& a = tabsixi[jtype];
-	      philj = r6inv * (a.lj3*r6inv - a.lj4);
-	      forcelj = forcelj*switch1 + philj*switch2;
-	    }
-	  }  
-	  
-	  double fpair = (forcecoul + factor_lj*forcelj) * r2inv;
-	  
-	  tmpfx += delx*fpair;
-	  tmpfy += dely*fpair;
-	  tmpfz += delz*fpair;
-	  if (NEWTON_PAIR || j < nlocal) {
-	    ff[j].x -= delx*fpair;
-	    ff[j].y -= dely*fpair;
-	    ff[j].z -= delz*fpair;
-	  }
-	  
-	  if (EFLAG) {
-	    if (rsq < cut_coulsq) {
-	      if (!ncoultablebits || rsq <= tabinnersq)
-		ecoul = prefactor*erfc;
-	      else {
-		table = etable[itable] + fraction*detable[itable];
-		ecoul = tmp_coef3 * table;
-	      }
-	      if (factor_coul < 1.0) ecoul -= (1.0-factor_coul)*prefactor;
-	    } else ecoul = 0.0;
-
-	    if (rsq < cut_ljsq) {
-	      fast_alpha_t& a = tabsixi[jtype];
-	      evdwl = r6inv*(a.lj3*r6inv-a.lj4);
-	      if (rsq > cut_lj_innersq) {
-		switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) *
-		  (tmp_coef2 + 2.0*rsq) * tmp_coef1;
-		evdwl *= switch1;
-	      }
-	      evdwl *= factor_lj;
-	    } else evdwl = 0.0;
-	  }
-	  
-	  if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
-			       evdwl,ecoul,fpair,delx,dely,delz);
-	}
-      }
-    }
-
-    ff[i].x += tmpfx;
-    ff[i].y += tmpfy;
-    ff[i].z += tmpfz;
-  }
-
-  free(fast_alpha); fast_alpha = 0;
-
-  if (vflag_fdotr) virial_fdotr_compute();
-}
-
-#undef EWALD_F
-#undef EWALD_P
-#undef EWALD_A1
-#undef EWALD_A2
-#undef EWALD_A3
-#undef EWALD_A4
-#undef EWALD_A5
-
-#undef MIN
-#undef MAX
 
 }

--- a/src/OPT/pair_lj_cut_opt.cpp
+++ b/src/OPT/pair_lj_cut_opt.cpp
@ -18,7 +18,11 @@
     Vincent Natoli, Stone Ridge Technology
 ------------------------------------------------------------------------- */

+#include "stdlib.h"
 #include "pair_lj_cut_opt.h"
+#include "atom.h"
+#include "force.h"
+#include "neigh_list.h"

 using namespace LAMMPS_NS;

@ -46,3 +50,152 @@ void PairLJCutOpt::compute(int eflag, int vflag)
    else return eval<0,0,0>();
  }
 }
+
+/* ---------------------------------------------------------------------- */
+
+template < int EVFLAG, int EFLAG, int NEWTON_PAIR >
+void PairLJCutOpt::eval()
+{
+  typedef struct { double x,y,z; } vec3_t;
+  
+  typedef struct {
+    double cutsq,lj1,lj2,lj3,lj4,offset;
+    double _pad[2];
+  } fast_alpha_t;
+  
+  int i,j,ii,jj,inum,jnum,itype,jtype,sbindex;
+  double factor_lj;
+  double evdwl = 0.0;
+  
+  double** __restrict__ x = atom->x;
+  double** __restrict__ f = atom->f;
+  int* __restrict__ type = atom->type;
+  int nlocal = atom->nlocal;
+  double* __restrict__ special_lj = force->special_lj;
+
+  inum = list->inum;
+  int* __restrict__ ilist = list->ilist;
+  int** __restrict__ firstneigh = list->firstneigh;
+  int* __restrict__ numneigh = list->numneigh;
+  
+  vec3_t* __restrict__ xx = (vec3_t*)x[0];
+  vec3_t* __restrict__ ff = (vec3_t*)f[0];
+  
+  int ntypes = atom->ntypes;
+  int ntypes2 = ntypes*ntypes;
+  
+  fast_alpha_t* __restrict__ fast_alpha = 
+    (fast_alpha_t*) malloc(ntypes2*sizeof(fast_alpha_t));
+  for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
+    fast_alpha_t& a = fast_alpha[i*ntypes+j];
+    a.cutsq = cutsq[i+1][j+1];
+    a.lj1 = lj1[i+1][j+1];
+    a.lj2 = lj2[i+1][j+1];
+    a.lj3 = lj3[i+1][j+1];
+    a.lj4 = lj4[i+1][j+1];
+    a.offset = offset[i+1][j+1];
+  }
+  fast_alpha_t* __restrict__ tabsix = fast_alpha;
+  
+  // loop over neighbors of my atoms
+  
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    double xtmp = xx[i].x;
+    double ytmp = xx[i].y;
+    double ztmp = xx[i].z;
+    itype = type[i] - 1;
+    int* __restrict__ jlist = firstneigh[i];
+    jnum = numneigh[i];
+    
+    double tmpfx = 0.0;
+    double tmpfy = 0.0;
+    double tmpfz = 0.0;
+    
+    fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
+    
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      sbindex = sbmask(j);
+
+      if (sbindex == 0) {
+	double delx = xtmp - xx[j].x;
+	double dely = ytmp - xx[j].y;
+	double delz = ztmp - xx[j].z;
+	double rsq = delx*delx + dely*dely + delz*delz;
+	
+	jtype = type[j] - 1;
+	
+	fast_alpha_t& a = tabsixi[jtype];
+	
+	if (rsq < a.cutsq) {
+	  double r2inv = 1.0/rsq;
+	  double r6inv = r2inv*r2inv*r2inv;
+	  double forcelj = r6inv * (a.lj1*r6inv - a.lj2);
+	  double fpair = forcelj*r2inv;
+	  
+	  tmpfx += delx*fpair;
+	  tmpfy += dely*fpair;
+	  tmpfz += delz*fpair;
+	  if (NEWTON_PAIR || j < nlocal) {
+	    ff[j].x -= delx*fpair;
+	    ff[j].y -= dely*fpair;
+	    ff[j].z -= delz*fpair;
+	  }
+
+	  if (EFLAG) evdwl = r6inv*(a.lj3*r6inv-a.lj4) - a.offset;
+
+	  if (EVFLAG)
+	    ev_tally(i,j,nlocal,NEWTON_PAIR,
+		     evdwl,0.0,fpair,delx,dely,delz);
+	}
+
+      } else {
+	factor_lj = special_lj[sbindex];
+	j &= NEIGHMASK;
+
+	double delx = xtmp - xx[j].x;
+	double dely = ytmp - xx[j].y;
+	double delz = ztmp - xx[j].z;
+	double rsq = delx*delx + dely*dely + delz*delz;
+	
+	int jtype1 = type[j];
+	jtype = jtype1 - 1;
+	
+	fast_alpha_t& a = tabsixi[jtype];
+	if (rsq < a.cutsq) {
+	  double r2inv = 1.0/rsq;
+	  double r6inv = r2inv*r2inv*r2inv;
+	  fast_alpha_t& a = tabsixi[jtype];
+	  double forcelj = r6inv * (a.lj1*r6inv - a.lj2);
+	  double fpair = factor_lj*forcelj*r2inv;
+	  
+	  tmpfx += delx*fpair;
+	  tmpfy += dely*fpair;
+	  tmpfz += delz*fpair;
+	  if (NEWTON_PAIR || j < nlocal) {
+	    ff[j].x -= delx*fpair;
+	    ff[j].y -= dely*fpair;
+	    ff[j].z -= delz*fpair;
+	  }
+
+	  if (EFLAG) {
+	    evdwl = r6inv*(a.lj3*r6inv-a.lj4) - a.offset;
+	    evdwl *= factor_lj;
+	  }
+	  
+	  if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
+			       evdwl,0.0,fpair,delx,dely,delz);
+	}
+      }
+    }
+
+    ff[i].x += tmpfx;
+    ff[i].y += tmpfy;
+    ff[i].z += tmpfz;
+  }
+
+  free(fast_alpha); fast_alpha = 0;
+  
+  if (vflag_fdotr) virial_fdotr_compute();
+}
--- a/src/OPT/pair_lj_cut_opt.h
+++ b/src/OPT/pair_lj_cut_opt.h
@ -11,13 +11,6 @@
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

-/* ----------------------------------------------------------------------
-   Contributing authors:  
-     James Fischer, High Performance Technologies, Inc.
-     David Richie, Stone Ridge Technology
-     Vincent Natol, Stone Ridge Technology
------------------------------------------------------------------------- */
-
 #ifdef PAIR_CLASS

 PairStyle(lj/cut/opt,PairLJCutOpt)
@ -27,11 +20,7 @@ PairStyle(lj/cut/opt,PairLJCutOpt)
 #ifndef LMP_PAIR_LJ_CUT_OPT_H
 #define LMP_PAIR_LJ_CUT_OPT_H

-#include "stdlib.h"
 #include "pair_lj_cut.h"
-#include "atom.h"
-#include "force.h"
-#include "neigh_list.h"

 namespace LAMMPS_NS {

@ -44,153 +33,6 @@ class PairLJCutOpt : public PairLJCut {
  template < int EVFLAG, int EFLAG, int NEWTON_PAIR > void eval();
 };

-template < int EVFLAG, int EFLAG, int NEWTON_PAIR >
-void PairLJCutOpt::eval()
-{
-  typedef struct { double x,y,z; } vec3_t;
-  
-  typedef struct {
-    double cutsq,lj1,lj2,lj3,lj4,offset;
-    double _pad[2];
-  } fast_alpha_t;
-  
-  int i,j,ii,jj,inum,jnum,itype,jtype,sbindex;
-  double factor_lj;
-  double evdwl = 0.0;
-  
-  double** __restrict__ x = atom->x;
-  double** __restrict__ f = atom->f;
-  int* __restrict__ type = atom->type;
-  int nlocal = atom->nlocal;
-  double* __restrict__ special_lj = force->special_lj;
-
-  inum = list->inum;
-  int* __restrict__ ilist = list->ilist;
-  int** __restrict__ firstneigh = list->firstneigh;
-  int* __restrict__ numneigh = list->numneigh;
-  
-  vec3_t* __restrict__ xx = (vec3_t*)x[0];
-  vec3_t* __restrict__ ff = (vec3_t*)f[0];
-  
-  int ntypes = atom->ntypes;
-  int ntypes2 = ntypes*ntypes;
-  
-  fast_alpha_t* __restrict__ fast_alpha = 
-    (fast_alpha_t*) malloc(ntypes2*sizeof(fast_alpha_t));
-  for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
-    fast_alpha_t& a = fast_alpha[i*ntypes+j];
-    a.cutsq = cutsq[i+1][j+1];
-    a.lj1 = lj1[i+1][j+1];
-    a.lj2 = lj2[i+1][j+1];
-    a.lj3 = lj3[i+1][j+1];
-    a.lj4 = lj4[i+1][j+1];
-    a.offset = offset[i+1][j+1];
-  }
-  fast_alpha_t* __restrict__ tabsix = fast_alpha;
-  
-  // loop over neighbors of my atoms
-  
-  for (ii = 0; ii < inum; ii++) {
-    i = ilist[ii];
-    double xtmp = xx[i].x;
-    double ytmp = xx[i].y;
-    double ztmp = xx[i].z;
-    itype = type[i] - 1;
-    int* __restrict__ jlist = firstneigh[i];
-    jnum = numneigh[i];
-    
-    double tmpfx = 0.0;
-    double tmpfy = 0.0;
-    double tmpfz = 0.0;
-    
-    fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
-    
-    for (jj = 0; jj < jnum; jj++) {
-      j = jlist[jj];
-      sbindex = sbmask(j);
-
-      if (sbindex == 0) {
-	double delx = xtmp - xx[j].x;
-	double dely = ytmp - xx[j].y;
-	double delz = ztmp - xx[j].z;
-	double rsq = delx*delx + dely*dely + delz*delz;
-	
-	jtype = type[j] - 1;
-	
-	fast_alpha_t& a = tabsixi[jtype];
-	
-	if (rsq < a.cutsq) {
-	  double r2inv = 1.0/rsq;
-	  double r6inv = r2inv*r2inv*r2inv;
-	  double forcelj = r6inv * (a.lj1*r6inv - a.lj2);
-	  double fpair = forcelj*r2inv;
-	  
-	  tmpfx += delx*fpair;
-	  tmpfy += dely*fpair;
-	  tmpfz += delz*fpair;
-	  if (NEWTON_PAIR || j < nlocal) {
-	    ff[j].x -= delx*fpair;
-	    ff[j].y -= dely*fpair;
-	    ff[j].z -= delz*fpair;
-	  }
-
-	  if (EFLAG) evdwl = r6inv*(a.lj3*r6inv-a.lj4) - a.offset;
-
-	  if (EVFLAG)
-	    ev_tally(i,j,nlocal,NEWTON_PAIR,
-		     evdwl,0.0,fpair,delx,dely,delz);
-	}
-
-      } else {
-	factor_lj = special_lj[sbindex];
-	j &= NEIGHMASK;
-
-	double delx = xtmp - xx[j].x;
-	double dely = ytmp - xx[j].y;
-	double delz = ztmp - xx[j].z;
-	double rsq = delx*delx + dely*dely + delz*delz;
-	
-	int jtype1 = type[j];
-	jtype = jtype1 - 1;
-	
-	fast_alpha_t& a = tabsixi[jtype];
-	if (rsq < a.cutsq) {
-	  double r2inv = 1.0/rsq;
-	  double r6inv = r2inv*r2inv*r2inv;
-	  fast_alpha_t& a = tabsixi[jtype];
-	  double forcelj = r6inv * (a.lj1*r6inv - a.lj2);
-	  double fpair = factor_lj*forcelj*r2inv;
-	  
-	  tmpfx += delx*fpair;
-	  tmpfy += dely*fpair;
-	  tmpfz += delz*fpair;
-	  if (NEWTON_PAIR || j < nlocal) {
-	    ff[j].x -= delx*fpair;
-	    ff[j].y -= dely*fpair;
-	    ff[j].z -= delz*fpair;
-	  }
-
-	  if (EFLAG) {
-	    evdwl = r6inv*(a.lj3*r6inv-a.lj4) - a.offset;
-	    evdwl *= factor_lj;
-	  }
-	  
-	  if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
-			       evdwl,0.0,fpair,delx,dely,delz);
-	}
-      }
-    }
-
-    ff[i].x += tmpfx;
-    ff[i].y += tmpfy;
-    ff[i].z += tmpfz;
-  }
-
-  free(fast_alpha); fast_alpha = 0;
-  
-  if (vflag_fdotr) virial_fdotr_compute();
-}
-
 }

 #endif
--- a/src/OPT/pair_morse_opt.cpp
+++ b/src/OPT/pair_morse_opt.cpp
@ -18,7 +18,12 @@
     Vincent Natoli, Stone Ridge Technology
 ------------------------------------------------------------------------- */

+#include "math.h"
+#include "stdlib.h"
 #include "pair_morse_opt.h"
+#include "atom.h"
+#include "force.h"
+#include "neigh_list.h"

 using namespace LAMMPS_NS;

@ -46,3 +51,148 @@ void PairMorseOpt::compute(int eflag, int vflag)
    else return eval<0,0,0>();
  }
 }
+
+/* ---------------------------------------------------------------------- */
+
+template < int EVFLAG, int EFLAG, int NEWTON_PAIR >
+void PairMorseOpt::eval()
+{
+  typedef struct { double x,y,z; } vec3_t;
+  
+  typedef struct {
+    double cutsq,r0,alpha,morse1,d0,offset;
+    double _pad[2];
+  } fast_alpha_t;
+  
+  int i,j,ii,jj,inum,jnum,itype,jtype,sbindex;
+  double factor_lj;
+  double evdwl = 0.0;
+  
+  double** __restrict__ x = atom->x;
+  double** __restrict__ f = atom->f;
+  int* __restrict__ type = atom->type;
+  int nlocal = atom->nlocal;
+  double* __restrict__ special_lj = force->special_lj;
+  
+  inum = list->inum;
+  int* __restrict__ ilist = list->ilist;
+  int** __restrict__ firstneigh = list->firstneigh;
+  int* __restrict__ numneigh = list->numneigh;
+  
+  vec3_t* __restrict__ xx = (vec3_t*)x[0];
+  vec3_t* __restrict__ ff = (vec3_t*)f[0];
+  
+  int ntypes = atom->ntypes;
+  int ntypes2 = ntypes*ntypes;
+  
+  fast_alpha_t* __restrict__ fast_alpha = 
+    (fast_alpha_t*) malloc(ntypes2*sizeof(fast_alpha_t));
+  for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
+    fast_alpha_t& a = fast_alpha[i*ntypes+j];
+    a.cutsq = cutsq[i+1][j+1];
+    a.r0 = r0[i+1][j+1];
+    a.alpha = alpha[i+1][j+1];
+    a.morse1 = morse1[i+1][j+1];
+    a.d0 = d0[i+1][j+1];
+    a.offset = offset[i+1][j+1];
+  }
+  fast_alpha_t* __restrict__ tabsix = fast_alpha;
+  
+  // loop over neighbors of my atoms
+  
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    double xtmp = xx[i].x;
+    double ytmp = xx[i].y;
+    double ztmp = xx[i].z;
+    itype = type[i] - 1;
+    int* __restrict__ jlist = firstneigh[i];
+    jnum = numneigh[i];
+    
+    double tmpfx = 0.0;
+    double tmpfy = 0.0;
+    double tmpfz = 0.0;
+    
+    fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
+   
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      sbindex = sbmask(j);
+
+      if (sbindex == 0) {
+	double delx = xtmp - xx[j].x;
+	double dely = ytmp - xx[j].y;
+	double delz = ztmp - xx[j].z;
+	double rsq = delx*delx + dely*dely + delz*delz;
+	
+	jtype = type[j] - 1;
+	
+	fast_alpha_t& a = tabsixi[jtype];
+	if (rsq < a.cutsq) {
+	  double r = sqrt(rsq);
+	  double dr = r - a.r0;
+	  double dexp = exp(-a.alpha * dr);
+	  double fpair = a.morse1 * (dexp*dexp - dexp) / r;
+
+	  tmpfx += delx*fpair;
+	  tmpfy += dely*fpair;
+	  tmpfz += delz*fpair;
+	  if (NEWTON_PAIR || j < nlocal) {
+	    ff[j].x -= delx*fpair;
+	    ff[j].y -= dely*fpair;
+	    ff[j].z -= delz*fpair;
+	  }
+
+	  if (EFLAG) evdwl = a.d0 * (dexp*dexp - 2.0*dexp) - a.offset;
+
+	  if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
+			       evdwl,0.0,fpair,delx,dely,delz);
+	}
+
+      } else {
+	factor_lj = special_lj[sbindex];
+	j &= NEIGHMASK;
+	
+	double delx = xtmp - xx[j].x;
+	double dely = ytmp - xx[j].y;
+	double delz = ztmp - xx[j].z;
+	double rsq = delx*delx + dely*dely + delz*delz;
+	
+	jtype = type[j] - 1;
+	
+	fast_alpha_t& a = tabsixi[jtype];
+	if (rsq < a.cutsq) {
+	  double r = sqrt(rsq);
+	  double dr = r - a.r0;
+	  double dexp = exp(-a.alpha * dr);
+	  double fpair = factor_lj * a.morse1 * (dexp*dexp - dexp) / r;
+
+	  tmpfx += delx*fpair;
+	  tmpfy += dely*fpair;
+	  tmpfz += delz*fpair;
+	  if (NEWTON_PAIR || j < nlocal) {
+	    ff[j].x -= delx*fpair;
+	    ff[j].y -= dely*fpair;
+	    ff[j].z -= delz*fpair;
+	  }
+
+	  if (EFLAG) {
+	    evdwl = a.d0 * (dexp*dexp - 2.0*dexp) - a.offset;
+	    evdwl *= factor_lj;
+	  }
+	  
+	  if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
+			       evdwl,0.0,fpair,delx,dely,delz);
+	}
+      }
+    }
+
+    ff[i].x += tmpfx;
+    ff[i].y += tmpfy;
+    ff[i].z += tmpfz;
+  }
+
+  free(fast_alpha); fast_alpha = 0;
+
+  if (vflag_fdotr) virial_fdotr_compute();
+}
--- a/src/OPT/pair_morse_opt.h
+++ b/src/OPT/pair_morse_opt.h
@ -11,13 +11,6 @@
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

-/* ----------------------------------------------------------------------
-   Contributing authors:  
-     James Fischer, High Performance Technologies, Inc.
-     David Richie, Stone Ridge Technology
-     Vincent Natol, Stone Ridge Technology
------------------------------------------------------------------------- */
-
 #ifdef PAIR_CLASS

 PairStyle(morse/opt,PairMorseOpt)
@ -27,12 +20,7 @@ PairStyle(morse/opt,PairMorseOpt)
 #ifndef LMP_PAIR_MORSE_OPT_H
 #define LMP_PAIR_MORSE_OPT_H

-#include "math.h"
-#include "stdlib.h"
 #include "pair_morse.h"
-#include "atom.h"
-#include "force.h"
-#include "neigh_list.h"

 namespace LAMMPS_NS {

@ -45,149 +33,6 @@ class PairMorseOpt : public PairMorse {
  template < int EVFLAG, int EFLAG, int NEWTON_PAIR > void eval();
 };

-template < int EVFLAG, int EFLAG, int NEWTON_PAIR >
-void PairMorseOpt::eval()
-{
-  typedef struct { double x,y,z; } vec3_t;
-  
-  typedef struct {
-    double cutsq,r0,alpha,morse1,d0,offset;
-    double _pad[2];
-  } fast_alpha_t;
-  
-  int i,j,ii,jj,inum,jnum,itype,jtype,sbindex;
-  double factor_lj;
-  double evdwl = 0.0;
-  
-  double** __restrict__ x = atom->x;
-  double** __restrict__ f = atom->f;
-  int* __restrict__ type = atom->type;
-  int nlocal = atom->nlocal;
-  double* __restrict__ special_lj = force->special_lj;
-  
-  inum = list->inum;
-  int* __restrict__ ilist = list->ilist;
-  int** __restrict__ firstneigh = list->firstneigh;
-  int* __restrict__ numneigh = list->numneigh;
-  
-  vec3_t* __restrict__ xx = (vec3_t*)x[0];
-  vec3_t* __restrict__ ff = (vec3_t*)f[0];
-  
-  int ntypes = atom->ntypes;
-  int ntypes2 = ntypes*ntypes;
-  
-  fast_alpha_t* __restrict__ fast_alpha = 
-    (fast_alpha_t*) malloc(ntypes2*sizeof(fast_alpha_t));
-  for (i = 0; i < ntypes; i++) for (j = 0; j < ntypes; j++) {
-    fast_alpha_t& a = fast_alpha[i*ntypes+j];
-    a.cutsq = cutsq[i+1][j+1];
-    a.r0 = r0[i+1][j+1];
-    a.alpha = alpha[i+1][j+1];
-    a.morse1 = morse1[i+1][j+1];
-    a.d0 = d0[i+1][j+1];
-    a.offset = offset[i+1][j+1];
-  }
-  fast_alpha_t* __restrict__ tabsix = fast_alpha;
-  
-  // loop over neighbors of my atoms
-  
-  for (ii = 0; ii < inum; ii++) {
-    i = ilist[ii];
-    double xtmp = xx[i].x;
-    double ytmp = xx[i].y;
-    double ztmp = xx[i].z;
-    itype = type[i] - 1;
-    int* __restrict__ jlist = firstneigh[i];
-    jnum = numneigh[i];
-    
-    double tmpfx = 0.0;
-    double tmpfy = 0.0;
-    double tmpfz = 0.0;
-    
-    fast_alpha_t* __restrict__ tabsixi = (fast_alpha_t*)&tabsix[itype*ntypes];
-   
-    for (jj = 0; jj < jnum; jj++) {
-      j = jlist[jj];
-      sbindex = sbmask(j);
-
-      if (sbindex == 0) {
-	double delx = xtmp - xx[j].x;
-	double dely = ytmp - xx[j].y;
-	double delz = ztmp - xx[j].z;
-	double rsq = delx*delx + dely*dely + delz*delz;
-	
-	jtype = type[j] - 1;
-	
-	fast_alpha_t& a = tabsixi[jtype];
-	if (rsq < a.cutsq) {
-	  double r = sqrt(rsq);
-	  double dr = r - a.r0;
-	  double dexp = exp(-a.alpha * dr);
-	  double fpair = a.morse1 * (dexp*dexp - dexp) / r;
-
-	  tmpfx += delx*fpair;
-	  tmpfy += dely*fpair;
-	  tmpfz += delz*fpair;
-	  if (NEWTON_PAIR || j < nlocal) {
-	    ff[j].x -= delx*fpair;
-	    ff[j].y -= dely*fpair;
-	    ff[j].z -= delz*fpair;
-	  }
-
-	  if (EFLAG) evdwl = a.d0 * (dexp*dexp - 2.0*dexp) - a.offset;
-
-	  if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
-			       evdwl,0.0,fpair,delx,dely,delz);
-	}
-
-      } else {
-	factor_lj = special_lj[sbindex];
-	j &= NEIGHMASK;
-	
-	double delx = xtmp - xx[j].x;
-	double dely = ytmp - xx[j].y;
-	double delz = ztmp - xx[j].z;
-	double rsq = delx*delx + dely*dely + delz*delz;
-	
-	jtype = type[j] - 1;
-	
-	fast_alpha_t& a = tabsixi[jtype];
-	if (rsq < a.cutsq) {
-	  double r = sqrt(rsq);
-	  double dr = r - a.r0;
-	  double dexp = exp(-a.alpha * dr);
-	  double fpair = factor_lj * a.morse1 * (dexp*dexp - dexp) / r;
-
-	  tmpfx += delx*fpair;
-	  tmpfy += dely*fpair;
-	  tmpfz += delz*fpair;
-	  if (NEWTON_PAIR || j < nlocal) {
-	    ff[j].x -= delx*fpair;
-	    ff[j].y -= dely*fpair;
-	    ff[j].z -= delz*fpair;
-	  }
-
-	  if (EFLAG) {
-	    evdwl = a.d0 * (dexp*dexp - 2.0*dexp) - a.offset;
-	    evdwl *= factor_lj;
-	  }
-	  
-	  if (EVFLAG) ev_tally(i,j,nlocal,NEWTON_PAIR,
-			       evdwl,0.0,fpair,delx,dely,delz);
-	}
-      }
-    }
-
-    ff[i].x += tmpfx;
-    ff[i].y += tmpfy;
-    ff[i].z += tmpfz;
-  }
-
-  free(fast_alpha); fast_alpha = 0;
-
-  if (vflag_fdotr) virial_fdotr_compute();
-}
-
 }

 #endif