git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@10907 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2013-10-30 15:42:14 +00:00
parent 15b7374ff0
commit 0fa3867f34
16 changed files with 1791 additions and 5 deletions
--- a/src/USER-OMP/pair_adp_omp.cpp
+++ b/src/USER-OMP/pair_adp_omp.cpp
@ -71,7 +71,6 @@ void PairADPOMP::compute(int eflag, int vflag)

    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
    ThrData *thr = fix->get_thr(tid);
-    thr->timer(Timer::START);
    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);

    if (force->newton_pair)
@ -92,7 +91,6 @@ void PairADPOMP::compute(int eflag, int vflag)
      else eval<0,0,0>(ifrom, ito, thr);
    }

-    thr->timer(Timer::PAIR);
    reduce_thr(this, eflag, vflag, thr);
  } // end of omp parallel region
 }
--- a/src/USER-OMP/pair_coul_dsf_omp.cpp
+++ b/src/USER-OMP/pair_coul_dsf_omp.cpp
@ -0,0 +1,177 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_coul_dsf_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+#include "math_const.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairCoulDSFOMP::PairCoulDSFOMP(LAMMPS *lmp)
+  : PairCoulDSF(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  respa_enable = 0;            // clear the respa_enable flag for this style
+  suffix_flag |= Suffix::OMP;  // add the Suffix::OMP bit to suffix_flag
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairCoulDSFOMP::compute(int eflag, int vflag)
+{
+  // call ev_setup() if energy or virial is requested, else zero both flags
+  if (!(eflag || vflag)) evflag = vflag_fdotr = 0;
+  else ev_setup(eflag,vflag);
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int nall = atom->nlocal + atom->nghost;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    // loop_setup_thr() fills in ifrom, ito and tid for the calling thread
+    int ifrom, ito, tid;
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // pick the eval<EVFLAG,EFLAG,NEWTON_PAIR> instantiation matching the flags
+    if (!evflag) {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    } else if (eflag) {
+      if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+      else eval<1,1,0>(ifrom, ito, thr);
+    } else {
+      if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+      else eval<1,0,0>(ifrom, ito, thr);
+    }
+
+    // hand this thread's ThrData to reduce_thr()
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairCoulDSFOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  // Coulomb kernel over the ilist entries [iifrom,iito).  Forces go into the
+  // array returned by thr->get_f().  EFLAG enables the energy terms, EVFLAG
+  // enables the per-pair ev_tally_thr() call, and NEWTON_PAIR makes the
+  // reaction force apply to every neighbor j rather than only j < nlocal.
+
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+  const double * _noalias const q = atom->q;
+  const double * _noalias const special_coul = force->special_coul;
+  const double qqrd2e = force->qqrd2e;
+  const int nlocal = atom->nlocal;
+
+  const int * const ilist = list->ilist;
+  const int * const numneigh = list->numneigh;
+  int ** const firstneigh = list->firstneigh;
+
+  // stays 0.0 for the whole call unless EFLAG is set
+  double ecoul = 0.0;
+
+  for (int ii = iifrom; ii < iito; ++ii) {
+    const int i = ilist[ii];
+    const double qi = q[i];
+    const double xi = x[i].x;
+    const double yi = x[i].y;
+    const double zi = x[i].z;
+    const int * const jlist = firstneigh[i];
+    const int jnum = numneigh[i];
+
+    // force on atom i is summed locally and stored once after the j loop
+    double fxi = 0.0, fyi = 0.0, fzi = 0.0;
+
+    if (EFLAG) {
+      // self term of atom i, tallied as an (i,i) pair with zero force
+      const double e_self = -(e_shift/2.0 + alpha/MY_PIS) * qi*qi*qqrd2e;
+      ev_tally_thr(this,i,i,nlocal,0,0.0,e_self,0.0,0.0,0.0,0.0,thr);
+    }
+
+    for (int jj = 0; jj < jnum; ++jj) {
+      int j = jlist[jj];
+      const double factor_coul = special_coul[sbmask(j)];
+      j &= NEIGHMASK;
+
+      const double delx = xi - x[j].x;
+      const double dely = yi - x[j].y;
+      const double delz = zi - x[j].z;
+      const double rsq = delx*delx + dely*dely + delz*delz;
+
+      // only pairs with rsq < cut_coulsq contribute
+      if (!(rsq < cut_coulsq)) continue;
+
+      const double r2inv = 1.0/rsq;
+      const double r = sqrt(rsq);
+      const double prefactor = factor_coul * qqrd2e*qi*q[j]/r;
+      const double erfcd = exp(-alpha*alpha*rsq);
+      const double t = 1.0 / (1.0 + EWALD_P*alpha*r);
+      // polynomial in t times erfcd, built from the A1..A5 constants
+      const double erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
+      const double forcecoul = prefactor * (erfcc/r + 2.0*alpha/MY_PIS * erfcd +
+                                            r*f_shift) * r;
+      const double fpair = forcecoul * r2inv;
+      if (EFLAG) ecoul = prefactor * (erfcc - r*e_shift - rsq*f_shift);
+
+      fxi += delx*fpair;
+      fyi += dely*fpair;
+      fzi += delz*fpair;
+      if (NEWTON_PAIR || j < nlocal) {
+        f[j].x -= delx*fpair;
+        f[j].y -= dely*fpair;
+        f[j].z -= delz*fpair;
+      }
+
+      if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                               0.0,ecoul,fpair,delx,dely,delz,thr);
+    }
+    f[i].x += fxi;
+    f[i].y += fyi;
+    f[i].z += fzi;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairCoulDSFOMP::memory_usage()
+{
+  // total = memory_usage_thr() + the base-class PairCoulDSF::memory_usage()
+  const double thr_bytes = memory_usage_thr();
+  const double base_bytes = PairCoulDSF::memory_usage();
+  return thr_bytes + base_bytes;
+}
--- a/src/USER-OMP/pair_coul_dsf_omp.h
+++ b/src/USER-OMP/pair_coul_dsf_omp.h
@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(coul/dsf/omp,PairCoulDSFOMP)
+
+#else
+
+#ifndef LMP_PAIR_COUL_DSF_OMP_H
+#define LMP_PAIR_COUL_DSF_OMP_H
+
+#include "pair_coul_dsf.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairCoulDSFOMP : public PairCoulDSF, public ThrOMP {
+  // Pair style built from the PairCoulDSF and ThrOMP base classes.
+ public:
+  PairCoulDSFOMP(class LAMMPS *);
+
+  virtual void compute(int, int);  // selects and runs an eval<> instantiation
+  virtual double memory_usage();   // memory_usage_thr() + PairCoulDSF::memory_usage()
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>  // compile-time switches
+  void eval(int ifrom, int ito, ThrData * const thr);  // kernel over [ifrom,ito)
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/pair_lj_cut_coul_dsf_omp.cpp
+++ b/src/USER-OMP/pair_lj_cut_coul_dsf_omp.cpp
@ -0,0 +1,201 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_coul_dsf_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+#include "math_const.h"
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulDSFOMP::PairLJCutCoulDSFOMP(LAMMPS *lmp)
+  : PairLJCutCoulDSF(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  respa_enable = 0;            // clear the respa_enable flag for this style
+  suffix_flag |= Suffix::OMP;  // add the Suffix::OMP bit to suffix_flag
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulDSFOMP::compute(int eflag, int vflag)
+{
+  // call ev_setup() if energy or virial is requested, else zero both flags
+  if (!(eflag || vflag)) evflag = vflag_fdotr = 0;
+  else ev_setup(eflag,vflag);
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int nall = atom->nlocal + atom->nghost;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    // loop_setup_thr() fills in ifrom, ito and tid for the calling thread
+    int ifrom, ito, tid;
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // pick the eval<EVFLAG,EFLAG,NEWTON_PAIR> instantiation matching the flags
+    if (!evflag) {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    } else if (eflag) {
+      if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+      else eval<1,1,0>(ifrom, ito, thr);
+    } else {
+      if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+      else eval<1,0,0>(ifrom, ito, thr);
+    }
+
+    // hand this thread's ThrData to reduce_thr()
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairLJCutCoulDSFOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,ii,jj,jnum,itype,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double r,rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj;
+  double prefactor,erfcc,erfcd,t;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+  // LJ + Coulomb kernel over ilist entries [iifrom,iito); forces go to thr->get_f()
+  evdwl = ecoul = 0.0;  // both stay 0.0 whenever EFLAG is off
+
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+  const double * _noalias const q = atom->q;
+  const int * _noalias const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * _noalias const special_coul = force->special_coul;
+  const double * _noalias const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    qtmp = q[i];
+    xtmp = x[i].x;
+    ytmp = x[i].y;
+    ztmp = x[i].z;
+    itype = type[i];
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;  // force on i is summed here, stored after the j loop
+    // self term carries energy only (fpair = 0), so gate it on EFLAG, not EVFLAG
+    if (EFLAG) {
+      double e_self = -(e_shift/2.0 + alpha/MY_PIS) * qtmp*qtmp*qqrd2e;
+      ev_tally_thr(this,i,i,nlocal,0,0.0,e_self,0.0,0.0,0.0,0.0,thr);
+    }
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];      // scale factors are looked up from
+      factor_coul = special_coul[sbmask(j)];  // sbmask(j) before j is masked
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j].x;
+      dely = ytmp - x[j].y;
+      delz = ztmp - x[j].z;
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsq[itype][jtype]) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_ljsq[itype][jtype]) {
+          r6inv = r2inv*r2inv*r2inv;
+          forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]);
+          forcelj *= factor_lj;
+        } else forcelj = 0.0;
+
+        if (rsq < cut_coulsq) {
+          r = sqrt(rsq);
+          prefactor = factor_coul * qqrd2e*qtmp*q[j]/r;
+          erfcd = exp(-alpha*alpha*r*r);
+          t = 1.0 / (1.0 + EWALD_P*alpha*r);
+          erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
+          forcecoul = prefactor * (erfcc/r + 2.0*alpha/MY_PIS * erfcd +
+            r*f_shift) * r;
+        } else forcecoul = 0.0;
+        fpair = (forcecoul + forcelj) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {  // reaction force on j
+          f[j].x -= delx*fpair;
+          f[j].y -= dely*fpair;
+          f[j].z -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          if (rsq < cut_ljsq[itype][jtype]) {
+            evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]) -
+              offset[itype][jtype];
+            evdwl *= factor_lj;
+          } else evdwl = 0.0;
+
+          if (rsq < cut_coulsq) {
+            ecoul = prefactor * (erfcc - r*e_shift - rsq*f_shift);
+          } else ecoul = 0.0;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i].x += fxtmp;
+    f[i].y += fytmp;
+    f[i].z += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairLJCutCoulDSFOMP::memory_usage()
+{
+  // total = memory_usage_thr() + the base-class PairLJCutCoulDSF::memory_usage()
+  const double thr_bytes = memory_usage_thr();
+  const double base_bytes = PairLJCutCoulDSF::memory_usage();
+  return thr_bytes + base_bytes;
+}
--- a/src/USER-OMP/pair_lj_cut_coul_dsf_omp.h
+++ b/src/USER-OMP/pair_lj_cut_coul_dsf_omp.h
@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/dsf/omp,PairLJCutCoulDSFOMP)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_DSF_OMP_H
+#define LMP_PAIR_LJ_CUT_COUL_DSF_OMP_H
+
+#include "pair_lj_cut_coul_dsf.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulDSFOMP : public PairLJCutCoulDSF, public ThrOMP {
+  // Pair style built from the PairLJCutCoulDSF and ThrOMP base classes.
+ public:
+  PairLJCutCoulDSFOMP(class LAMMPS *);
+
+  virtual void compute(int, int);  // selects and runs an eval<> instantiation
+  virtual double memory_usage();   // memory_usage_thr() + PairLJCutCoulDSF::memory_usage()
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>  // compile-time switches
+  void eval(int ifrom, int ito, ThrData * const thr);  // kernel over [ifrom,ito)
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/pair_nm_cut_coul_cut_omp.cpp
+++ b/src/USER-OMP/pair_nm_cut_coul_cut_omp.cpp
@ -0,0 +1,198 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_nm_cut_coul_cut_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairNMCutCoulCutOMP::PairNMCutCoulCutOMP(LAMMPS *lmp)
+  : PairNMCutCoulCut(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  respa_enable = 0;            // clear the respa_enable flag for this style
+  suffix_flag |= Suffix::OMP;  // add the Suffix::OMP bit to suffix_flag
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairNMCutCoulCutOMP::compute(int eflag, int vflag)
+{
+  // call ev_setup() if energy or virial is requested, else zero both flags
+  if (!(eflag || vflag)) evflag = vflag_fdotr = 0;
+  else ev_setup(eflag,vflag);
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int nall = atom->nlocal + atom->nghost;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    // loop_setup_thr() fills in ifrom, ito and tid for the calling thread
+    int ifrom, ito, tid;
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // pick the eval<EVFLAG,EFLAG,NEWTON_PAIR> instantiation matching the flags
+    if (!evflag) {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    } else if (eflag) {
+      if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+      else eval<1,1,0>(ifrom, ito, thr);
+    } else {
+      if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+      else eval<1,0,0>(ifrom, ito, thr);
+    }
+
+    // hand this thread's ThrData to reduce_thr()
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairNMCutCoulCutOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int j,ii,jj,jnum,jtype;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double r,rsq,r2inv,rminv,rninv,forcecoul,forcenm,factor_coul,factor_lj;
+  int *ilist,*numneigh,**firstneigh;
+  // N-M + cutoff Coulomb kernel over ilist entries [iifrom,iito); forces go to thr->get_f()
+  evdwl = ecoul = 0.0;  // both stay 0.0 whenever EFLAG is off
+
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+  const double * _noalias const q = atom->q;
+  const int * _noalias const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * _noalias const special_coul = force->special_coul;
+  const double * _noalias const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    const int i = ilist[ii];
+    const int itype = type[i];
+    const int    * _noalias const jlist = firstneigh[i];
+    const double * _noalias const cutsqi = cutsq[itype];  // rows for type itype
+    const double * _noalias const cut_coulsqi = cut_coulsq[itype];
+    const double * _noalias const cut_ljsqi = cut_ljsq[itype];
+    const double * _noalias const offseti = offset[itype];
+    const double * _noalias const mmi = mm[itype];
+    const double * _noalias const nni = nn[itype];
+    const double * _noalias const nmi = nm[itype];
+    const double * _noalias const e0nmi = e0nm[itype];
+    const double * _noalias const r0mi = r0m[itype];
+    const double * _noalias const r0ni = r0n[itype];
+
+    qtmp = q[i];
+    xtmp = x[i].x;
+    ytmp = x[i].y;
+    ztmp = x[i].z;
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;  // force on i is summed here, stored after the j loop
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];      // scale factors are looked up from
+      factor_coul = special_coul[sbmask(j)];  // sbmask(j) before j is masked
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j].x;
+      dely = ytmp - x[j].y;
+      delz = ztmp - x[j].z;
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsqi[jtype]) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsqi[jtype]) {
+          const double rinv = sqrt(r2inv);
+          forcecoul = qqrd2e * qtmp*q[j]*rinv;
+          forcecoul *= factor_coul;
+          if (EFLAG) ecoul = factor_coul * qqrd2e * qtmp*q[j]*rinv;
+        } else {
+          forcecoul = 0.0;
+          if (EFLAG) ecoul = 0.0;
+        }
+
+        if (rsq < cut_ljsqi[jtype]) {
+          r = sqrt(rsq);
+          forcenm = e0nmi[jtype]*nmi[jtype] *
+            (r0ni[jtype]/pow(r,nni[jtype]) -
+             r0mi[jtype]/pow(r,mmi[jtype]));
+          forcenm *= factor_lj;
+          if (EFLAG) {
+            // r^-m and r^-n feed only the energy, so compute them only here
+            rminv = pow(r2inv,mmi[jtype]/2.0);
+            rninv = pow(r2inv,nni[jtype]/2.0);
+            evdwl = (e0nmi[jtype]*(mmi[jtype]*r0ni[jtype]*rninv -
+                                   nni[jtype]*r0mi[jtype]*rminv) -
+                     offseti[jtype]) * factor_lj;
+          }
+        } else {
+          forcenm = 0.0;
+          if (EFLAG) evdwl = 0.0;
+        }
+
+        fpair = (forcecoul + forcenm) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {  // reaction force on j
+          f[j].x -= delx*fpair;
+          f[j].y -= dely*fpair;
+          f[j].z -= delz*fpair;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i].x += fxtmp;
+    f[i].y += fytmp;
+    f[i].z += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+double PairNMCutCoulCutOMP::memory_usage()
+{
+  // total = memory_usage_thr() + the base-class PairNMCutCoulCut::memory_usage()
+  const double thr_bytes = memory_usage_thr();
+  const double base_bytes = PairNMCutCoulCut::memory_usage();
+  return thr_bytes + base_bytes;
+}
--- a/src/USER-OMP/pair_nm_cut_coul_cut_omp.h
+++ b/src/USER-OMP/pair_nm_cut_coul_cut_omp.h
@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(nm/cut/coul/cut/omp,PairNMCutCoulCutOMP)
+
+#else
+
+#ifndef LMP_PAIR_NM_CUT_COUL_CUT_OMP_H
+#define LMP_PAIR_NM_CUT_COUL_CUT_OMP_H
+
+#include "pair_nm_cut_coul_cut.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+class PairNMCutCoulCutOMP : public PairNMCutCoulCut, public ThrOMP {
+  // Pair style built from the PairNMCutCoulCut and ThrOMP base classes.
+ public:
+  PairNMCutCoulCutOMP(class LAMMPS *);
+
+  virtual void compute(int, int);  // selects and runs an eval<> instantiation
+  virtual double memory_usage();   // memory_usage_thr() + PairNMCutCoulCut::memory_usage()
+
+ private:
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>  // compile-time switches
+  void eval(int ifrom, int ito, ThrData * const thr);  // kernel over [ifrom,ito)
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/pair_nm_cut_coul_long_omp.cpp
+++ b/src/USER-OMP/pair_nm_cut_coul_long_omp.cpp
@ -0,0 +1,234 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_nm_cut_coul_long_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define EWALD_F   1.12837917
+#define EWALD_P   0.3275911
+#define A1        0.254829592
+#define A2       -0.284496736
+#define A3        1.421413741
+#define A4       -1.453152027
+#define A5        1.061405429
+
+/* ---------------------------------------------------------------------- */
+
+PairNMCutCoulLongOMP::PairNMCutCoulLongOMP(LAMMPS *lmp)
+  : PairNMCutCoulLong(lmp), ThrOMP(lmp, THR_PAIR)
+{
+  respa_enable = 0;            // clear the respa_enable flag for this style
+  suffix_flag |= Suffix::OMP;  // add the Suffix::OMP bit to suffix_flag
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairNMCutCoulLongOMP::compute(int eflag, int vflag)
+{
+  // call ev_setup() if energy or virial is requested, else zero both flags
+  if (!(eflag || vflag)) evflag = vflag_fdotr = 0;
+  else ev_setup(eflag,vflag);
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int nall = atom->nlocal + atom->nghost;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    // loop_setup_thr() fills in ifrom, ito and tid for the calling thread
+    int ifrom, ito, tid;
+    loop_setup_thr(ifrom, ito, tid, inum, nthreads);
+    ThrData *thr = fix->get_thr(tid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // pick the eval<EVFLAG,EFLAG,NEWTON_PAIR> instantiation matching the flags
+    if (!evflag) {
+      if (force->newton_pair) eval<0,0,1>(ifrom, ito, thr);
+      else eval<0,0,0>(ifrom, ito, thr);
+    } else if (eflag) {
+      if (force->newton_pair) eval<1,1,1>(ifrom, ito, thr);
+      else eval<1,1,0>(ifrom, ito, thr);
+    } else {
+      if (force->newton_pair) eval<1,0,1>(ifrom, ito, thr);
+      else eval<1,0,0>(ifrom, ito, thr);
+    }
+
+    // hand this thread's ThrData to reduce_thr()
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairNMCutCoulLongOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int j,ii,jj,jnum,jtype,itable;
+  double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair;
+  double fraction,table;
+  double r,rsq,r2inv,factor_coul,factor_lj;
+  double forcecoul,forcenm,rminv,rninv;
+  double grij,expm2,prefactor,t,erfcc;
+  int *ilist,*numneigh,**firstneigh;
+  // N-M + long-range Coulomb kernel over ilist entries [iifrom,iito); forces go to thr->get_f()
+  evdwl = ecoul = 0.0;  // both stay 0.0 whenever EFLAG is off
+
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+  const double * _noalias const q = atom->q;
+  const int * _noalias const type = atom->type;
+  const int nlocal = atom->nlocal;
+  const double * _noalias const special_coul = force->special_coul;
+  const double * _noalias const special_lj = force->special_lj;
+  const double qqrd2e = force->qqrd2e;
+  double fxtmp,fytmp,fztmp;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // loop over neighbors of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    const int i = ilist[ii];
+    const int itype = type[i];
+    const int    * _noalias const jlist = firstneigh[i];
+    const double * _noalias const cutsqi = cutsq[itype];  // rows for type itype
+    const double * _noalias const cut_ljsqi = cut_ljsq[itype];
+    const double * _noalias const offseti = offset[itype];
+    const double * _noalias const mmi = mm[itype];
+    const double * _noalias const nni = nn[itype];
+    const double * _noalias const nmi = nm[itype];
+    const double * _noalias const e0nmi = e0nm[itype];
+    const double * _noalias const r0mi = r0m[itype];
+    const double * _noalias const r0ni = r0n[itype];
+
+    qtmp = q[i];
+    xtmp = x[i].x;
+    ytmp = x[i].y;
+    ztmp = x[i].z;
+    jnum = numneigh[i];
+    fxtmp=fytmp=fztmp=0.0;  // force on i is summed here, stored after the j loop
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      factor_lj = special_lj[sbmask(j)];      // scale factors are looked up from
+      factor_coul = special_coul[sbmask(j)];  // sbmask(j) before j is masked
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j].x;
+      dely = ytmp - x[j].y;
+      delz = ztmp - x[j].z;
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsqi[jtype]) {
+        r2inv = 1.0/rsq;
+
+        if (rsq < cut_coulsq) {
+          if (!ncoultablebits || rsq <= tabinnersq) {  // analytic path, no table
+            r = sqrt(rsq);
+            grij = g_ewald * r;
+            expm2 = exp(-grij*grij);
+            t = 1.0 / (1.0 + EWALD_P*grij);
+            erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+            prefactor = qqrd2e * qtmp*q[j]/r;
+            forcecoul = prefactor * (erfcc + EWALD_F*grij*expm2);
+            if (EFLAG) ecoul = prefactor*erfcc;
+            if (factor_coul < 1.0) {
+              forcecoul -= (1.0-factor_coul)*prefactor;
+              if (EFLAG) ecoul -= (1.0-factor_coul)*prefactor;
+            }
+          } else {  // tabulated path: linear interpolation in the tables
+            union_int_float_t rsq_lookup;
+            rsq_lookup.f = rsq;
+            itable = rsq_lookup.i & ncoulmask;  // table index from the bits of rsq
+            itable >>= ncoulshiftbits;
+            fraction = (rsq_lookup.f - rtable[itable]) * drtable[itable];
+            table = ftable[itable] + fraction*dftable[itable];
+            forcecoul = qtmp*q[j] * table;
+            if (EFLAG)
+              ecoul = qtmp*q[j] * (etable[itable] + fraction*detable[itable]);
+            if (factor_coul < 1.0) {
+              table = ctable[itable] + fraction*dctable[itable];
+              prefactor = qtmp*q[j] * table;
+              forcecoul -= (1.0-factor_coul)*prefactor;
+              if (EFLAG) ecoul -= (1.0-factor_coul)*prefactor;
+            }
+          }
+        } else {
+          forcecoul = 0.0;
+          if (EFLAG) ecoul = 0.0;
+        }
+
+        if (rsq < cut_ljsqi[jtype]) {
+          r = sqrt(rsq);
+          forcenm = e0nmi[jtype]*nmi[jtype] *
+            (r0ni[jtype]/pow(r,nni[jtype]) -
+             r0mi[jtype]/pow(r,mmi[jtype]));
+          forcenm *= factor_lj;
+          if (EFLAG) {
+            // r^-m and r^-n feed only the energy, so compute them only here
+            rminv = pow(r2inv,mmi[jtype]/2.0);
+            rninv = pow(r2inv,nni[jtype]/2.0);
+            evdwl = (e0nmi[jtype]*(mmi[jtype]*r0ni[jtype]*rninv -
+                                   nni[jtype]*r0mi[jtype]*rminv) -
+                     offseti[jtype]) * factor_lj;
+          }
+        } else {
+          forcenm = 0.0;
+          if (EFLAG) evdwl = 0.0;
+        }
+
+        fpair = (forcecoul + forcenm) * r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {  // reaction force on j
+          f[j].x -= delx*fpair;
+          f[j].y -= dely*fpair;
+          f[j].z -= delz*fpair;
+        }
+
+        if (EVFLAG) ev_tally_thr(this, i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,ecoul,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i].x += fxtmp;
+    f[i].y += fytmp;
+    f[i].z += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+  double PairNMCutCoulLongOMP::memory_usage()
+  {
+    // total = memory_usage_thr() + the base-class PairNMCutCoulLong::memory_usage()
+    const double thr_bytes = memory_usage_thr();
+    const double base_bytes = PairNMCutCoulLong::memory_usage();
+    return thr_bytes + base_bytes;
+  }
--- a/src/USER-OMP/pair_nm_cut_coul_long_omp.h
+++ b/src/USER-OMP/pair_nm_cut_coul_long_omp.h
@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(nm/cut/coul/long/omp,PairNMCutCoulLongOMP)
+
+#else
+
+#ifndef LMP_PAIR_NM_CUT_COUL_LONG_OMP_H
+#define LMP_PAIR_NM_CUT_COUL_LONG_OMP_H
+
+#include "pair_nm_cut_coul_long.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+// USER-OMP counterpart of PairNMCutCoulLong, registered above as pair
+// style nm/cut/coul/long/omp; combines the base pair style with ThrOMP.
+class PairNMCutCoulLongOMP
+  : public PairNMCutCoulLong,
+    public ThrOMP {
+
+ public:
+  PairNMCutCoulLongOMP(class LAMMPS *);
+
+  // overrides of the pair style entry points
+  virtual void compute(int eflag, int vflag);
+  virtual double memory_usage();
+
+ private:
+  // force kernel working on the index range [iifrom,iito) with the
+  // per-thread data in thr; instantiated once per flag combination
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int iifrom, int iito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/pair_nm_cut_omp.cpp
+++ b/src/USER-OMP/pair_nm_cut_omp.cpp
@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_nm_cut_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairNMCutOMP::PairNMCutOMP(LAMMPS *lmp)
+  : PairNMCut(lmp),
+    ThrOMP(lmp, THR_PAIR)
+{
+  // clear the respa_enable flag for this style
+  respa_enable = 0;
+
+  // add the OMP bit to the suffix flags
+  suffix_flag = suffix_flag | Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* set up the tally flags, then let every thread run the eval<> kernel
+   on its share of the neighbor list and reduce its per-thread data */
+void PairNMCutOMP::compute(int eflag, int vflag)
+{
+  // nothing requested: just zero evflag and vflag_fdotr
+  if (!(eflag || vflag)) {
+    vflag_fdotr = 0;
+    evflag = 0;
+  } else {
+    ev_setup(eflag,vflag);
+  }
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int nall = atom->nlocal + atom->nghost;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int start, stop, mytid;
+
+    // loop_setup_thr() fills in this thread's id and index range
+    loop_setup_thr(start, stop, mytid, inum, nthreads);
+    ThrData *thr = fix->get_thr(mytid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // pick the instantiation eval<EVFLAG,EFLAG,NEWTON_PAIR>
+    if (!evflag) {
+      if (force->newton_pair) eval<0,0,1>(start, stop, thr);
+      else eval<0,0,0>(start, stop, thr);
+    } else if (eflag) {
+      if (force->newton_pair) eval<1,1,1>(start, stop, thr);
+      else eval<1,1,0>(start, stop, thr);
+    } else {
+      if (force->newton_pair) eval<1,0,1>(start, stop, thr);
+      else eval<1,0,0>(start, stop, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   N-M pair kernel for the atoms ilist[iifrom] ... ilist[iito-1].
+   Template flags:
+     EVFLAG      - nonzero: tally each pair through ev_tally_thr()
+     EFLAG       - nonzero: evaluate the pair energy evdwl
+     NEWTON_PAIR - zero: apply the reaction force on j only if j < nlocal
+   Forces are accumulated into the array returned by thr->get_f().
+------------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairNMCutOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+  const int * _noalias const type = atom->type;
+  const double * _noalias const special_lj = force->special_lj;
+  const int * _noalias const ilist = list->ilist;
+  const int * _noalias const numneigh = list->numneigh;
+  const int * const * const firstneigh = list->firstneigh;
+
+  double xtmp,ytmp,ztmp,delx,dely,delz,fxtmp,fytmp,fztmp;
+  double r,rsq,r2inv,rminv,rninv,forcenm,factor_lj,evdwl,fpair;
+
+  const int nlocal = atom->nlocal;
+  int j,jj,jnum,jtype;
+
+  // stays 0.0 in the tally calls when EFLAG is off
+  evdwl = 0.0;
+
+  // loop over neighbors of my atoms
+
+  for (int ii = iifrom; ii < iito; ++ii) {
+    const int i = ilist[ii];
+    const int itype = type[i];
+    const int    * _noalias const jlist = firstneigh[i];
+
+    // rows of the per-type-pair coefficient tables for type itype
+    const double * _noalias const cutsqi = cutsq[itype];
+    const double * _noalias const offseti = offset[itype];
+    const double * _noalias const mmi = mm[itype];
+    const double * _noalias const nni = nn[itype];
+    const double * _noalias const nmi = nm[itype];
+    const double * _noalias const e0nmi = e0nm[itype];
+    const double * _noalias const r0mi = r0m[itype];
+    const double * _noalias const r0ni = r0n[itype];
+
+    xtmp = x[i].x;
+    ytmp = x[i].y;
+    ztmp = x[i].z;
+    jnum = numneigh[i];
+
+    // force on atom i is summed locally and stored once after the j loop
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      // weight selected by sbmask(j); afterwards j is reduced to the
+      // plain atom index with NEIGHMASK
+      factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j].x;
+      dely = ytmp - x[j].y;
+      delz = ztmp - x[j].z;
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cutsqi[jtype]) {
+        r2inv = 1.0/rsq;
+        r = sqrt(rsq);
+
+        // r^-m and r^-n, used by the energy expression below
+        rminv = pow(r2inv,mmi[jtype]*0.5);
+        rninv = pow(r2inv,nni[jtype]*0.5);
+
+        forcenm = e0nmi[jtype]*nmi[jtype] *
+          (r0ni[jtype]/pow(r,nni[jtype]) -
+           r0mi[jtype]/pow(r,mmi[jtype]));
+        fpair = factor_lj*forcenm*r2inv;
+
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j].x -= delx*fpair;
+          f[j].y -= dely*fpair;
+          f[j].z -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = e0nmi[jtype] *
+            (mmi[jtype]*r0ni[jtype]*rninv -
+             nni[jtype]*r0mi[jtype]*rminv) - offseti[jtype];
+          // the special-bond weight must scale the energy exactly as it
+          // scales the force above; without it excluded or reduced pairs
+          // would be tallied at full strength
+          evdwl *= factor_lj;
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i].x += fxtmp;
+    f[i].y += fytmp;
+    f[i].z += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* report memory as the sum of what memory_usage_thr() returns and
+   what the serial base style PairNMCut reports */
+double PairNMCutOMP::memory_usage()
+{
+  const double thr_bytes = memory_usage_thr();
+  const double base_bytes = PairNMCut::memory_usage();
+
+  return thr_bytes + base_bytes;
+}
--- a/src/USER-OMP/pair_nm_cut_omp.h
+++ b/src/USER-OMP/pair_nm_cut_omp.h
@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(nm/cut/omp,PairNMCutOMP)
+
+#else
+
+#ifndef LMP_PAIR_NM_CUT_OMP_H
+#define LMP_PAIR_NM_CUT_OMP_H
+
+#include "pair_nm_cut.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+// USER-OMP counterpart of PairNMCut, registered above as pair style
+// nm/cut/omp; combines the base pair style with ThrOMP.
+class PairNMCutOMP
+  : public PairNMCut,
+    public ThrOMP {
+
+ public:
+  PairNMCutOMP(class LAMMPS *);
+
+  // overrides of the pair style entry points
+  virtual void compute(int eflag, int vflag);
+  virtual double memory_usage();
+
+ private:
+  // force kernel working on the index range [iifrom,iito) with the
+  // per-thread data in thr; instantiated once per flag combination
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int iifrom, int iito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/pair_tersoff_mod_omp.cpp
+++ b/src/USER-OMP/pair_tersoff_mod_omp.cpp
@ -0,0 +1,250 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_tersoff_mod_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairTersoffMODOMP::PairTersoffMODOMP(LAMMPS *lmp)
+  : PairTersoffMOD(lmp),
+    ThrOMP(lmp, THR_PAIR)
+{
+  // clear the respa_enable flag for this style
+  respa_enable = 0;
+
+  // add the OMP bit to the suffix flags
+  suffix_flag = suffix_flag | Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* set up the tally flags, then let every thread run the eval<> kernel
+   on its share of the neighbor list and reduce its per-thread data */
+void PairTersoffMODOMP::compute(int eflag, int vflag)
+{
+  // nothing requested: just zero evflag, vflag_fdotr and vflag_atom
+  if (!(eflag || vflag)) {
+    vflag_atom = 0;
+    vflag_fdotr = 0;
+    evflag = 0;
+  } else {
+    ev_setup(eflag,vflag);
+  }
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int nall = atom->nlocal + atom->nghost;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int start, stop, mytid;
+
+    // loop_setup_thr() fills in this thread's id and index range
+    loop_setup_thr(start, stop, mytid, inum, nthreads);
+    ThrData *thr = fix->get_thr(mytid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // pick the instantiation eval<EVFLAG,EFLAG,VFLAG_ATOM>
+    if (!evflag) {
+      eval<0,0,0>(start, stop, thr);
+    } else if (eflag) {
+      if (vflag_atom) eval<1,1,1>(start, stop, thr);
+      else eval<1,1,0>(start, stop, thr);
+    } else {
+      if (vflag_atom) eval<1,0,1>(start, stop, thr);
+      else eval<1,0,0>(start, stop, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   Force/energy kernel for the atoms ilist[iifrom] ... ilist[iito-1].
+   Template flags:
+     EVFLAG     - nonzero: call ev_tally_thr() for the pairwise terms
+     EFLAG      - passed on to repulsive() and force_zeta()
+     VFLAG_ATOM - nonzero: call v_tally3_thr() for every i-j-k triplet
+   Forces are accumulated into the array returned by thr->get_f().
+------------------------------------------------------------------------- */
+template <int EVFLAG, int EFLAG, int VFLAG_ATOM>
+void PairTersoffMODOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  int i,j,k,ii,jj,kk,jnum;
+  int itag,jtag,itype,jtype,ktype,iparam_ij,iparam_ijk;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
+  double rsq,rsq1,rsq2;
+  double delr1[3],delr2[3],fi[3],fj[3],fk[3];
+  double zeta_ij,prefactor;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = 0.0;
+
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+  const int * _noalias const tag = atom->tag;
+  const int * _noalias const type = atom->type;
+  const int nlocal = atom->nlocal;
+
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // running force on atom i; written to f[i] once per i at the end
+  double fxtmp,fytmp,fztmp;
+
+  // loop over full neighbor list of my atoms
+
+  for (ii = iifrom; ii < iito; ++ii) {
+
+    i = ilist[ii];
+    itag = tag[i];
+    // map[] translates the atom type into the element index used by elem2param
+    itype = map[type[i]];
+    xtmp = x[i].x;
+    ytmp = x[i].y;
+    ztmp = x[i].z;
+    fxtmp = fytmp = fztmp = 0.0;
+
+    // two-body interactions, skip half of them
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtag = tag[j];
+
+      // keep only one of (i,j) / (j,i): decide by tag order and the parity
+      // of the tag sum; for equal tags (presumably periodic images of the
+      // same atom - confirm) decide by comparing coordinates z, then y, then x
+      if (itag > jtag) {
+        if ((itag+jtag) % 2 == 0) continue;
+      } else if (itag < jtag) {
+        if ((itag+jtag) % 2 == 1) continue;
+      } else {
+        if (x[j].z < ztmp) continue;
+        if (x[j].z == ztmp && x[j].y < ytmp) continue;
+        if (x[j].z == ztmp && x[j].y == ytmp && x[j].x < xtmp) continue;
+      }
+
+      jtype = map[type[j]];
+
+      delx = xtmp - x[j].x;
+      dely = ytmp - x[j].y;
+      delz = ztmp - x[j].z;
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      iparam_ij = elem2param[itype][jtype][jtype];
+      if (rsq > params[iparam_ij].cutsq) continue;
+
+      // repulsive() fills fpair (and evdwl, depending on EFLAG)
+      repulsive(&params[iparam_ij],rsq,fpair,EFLAG,evdwl);
+
+      // f[j] is updated unconditionally here, and the tally below is
+      // called with newton_pair hard-wired to 1
+      fxtmp += delx*fpair;
+      fytmp += dely*fpair;
+      fztmp += delz*fpair;
+      f[j].x -= delx*fpair;
+      f[j].y -= dely*fpair;
+      f[j].z -= delz*fpair;
+
+      if (EVFLAG) ev_tally_thr(this,i,j,nlocal,/* newton_pair */ 1,
+                               evdwl,0.0,fpair,delx,dely,delz,thr);
+    }
+
+    // three-body interactions
+    // skip immediately if I-J is not within cutoff
+    // running force on atom j; written to f[j] once per j after both k loops
+    double fjxtmp,fjytmp,fjztmp;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = map[type[j]];
+      iparam_ij = elem2param[itype][jtype][jtype];
+
+      // note the sign convention: delr1 points from i to j (x[j] - x[i]),
+      // opposite to delx/dely/delz in the two-body loop above
+      delr1[0] = x[j].x - xtmp;
+      delr1[1] = x[j].y - ytmp;
+      delr1[2] = x[j].z - ztmp;
+      rsq1 = delr1[0]*delr1[0] + delr1[1]*delr1[1] + delr1[2]*delr1[2];
+      if (rsq1 > params[iparam_ij].cutsq) continue;
+
+      // accumulate bondorder zeta for each i-j interaction via loop over k
+
+      fjxtmp = fjytmp = fjztmp = 0.0;
+      zeta_ij = 0.0;
+
+      for (kk = 0; kk < jnum; kk++) {
+        if (jj == kk) continue;
+        k = jlist[kk];
+        k &= NEIGHMASK;
+        ktype = map[type[k]];
+        iparam_ijk = elem2param[itype][jtype][ktype];
+
+        delr2[0] = x[k].x - xtmp;
+        delr2[1] = x[k].y - ytmp;
+        delr2[2] = x[k].z - ztmp;
+        rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
+        if (rsq2 > params[iparam_ijk].cutsq) continue;
+
+        zeta_ij += zeta(&params[iparam_ijk],rsq1,rsq2,delr1,delr2);
+      }
+
+      // pairwise force due to zeta
+
+      // force_zeta() fills fpair and prefactor (and evdwl, depending on EFLAG);
+      // prefactor is consumed by attractive() in the second k loop
+      force_zeta(&params[iparam_ij],rsq1,zeta_ij,fpair,prefactor,EFLAG,evdwl);
+
+      fxtmp += delr1[0]*fpair;
+      fytmp += delr1[1]*fpair;
+      fztmp += delr1[2]*fpair;
+      fjxtmp -= delr1[0]*fpair;
+      fjytmp -= delr1[1]*fpair;
+      fjztmp -= delr1[2]*fpair;
+
+      // fpair and delr1 are both negated for the tally because delr1 has
+      // the opposite direction to the del* vector used in the two-body tally
+      if (EVFLAG) ev_tally_thr(this,i,j,nlocal,/* newton_pair */ 1,evdwl,0.0,
+                               -fpair,-delr1[0],-delr1[1],-delr1[2],thr);
+
+      // attractive term via loop over k
+
+      for (kk = 0; kk < jnum; kk++) {
+        if (jj == kk) continue;
+        k = jlist[kk];
+        k &= NEIGHMASK;
+        ktype = map[type[k]];
+        iparam_ijk = elem2param[itype][jtype][ktype];
+
+        delr2[0] = x[k].x - xtmp;
+        delr2[1] = x[k].y - ytmp;
+        delr2[2] = x[k].z - ztmp;
+        rsq2 = delr2[0]*delr2[0] + delr2[1]*delr2[1] + delr2[2]*delr2[2];
+        if (rsq2 > params[iparam_ijk].cutsq) continue;
+
+        // attractive() returns the three force vectors fi, fj, fk
+        attractive(&params[iparam_ijk],prefactor,
+                   rsq1,rsq2,delr1,delr2,fi,fj,fk);
+
+        // i and j contributions go to the local accumulators,
+        // the k contribution is stored directly
+        fxtmp += fi[0];
+        fytmp += fi[1];
+        fztmp += fi[2];
+        fjxtmp += fj[0];
+        fjytmp += fj[1];
+        fjztmp += fj[2];
+        f[k].x += fk[0];
+        f[k].y += fk[1];
+        f[k].z += fk[2];
+
+        if (VFLAG_ATOM) v_tally3_thr(i,j,k,fj,fk,delr1,delr2,thr);
+      }
+      f[j].x += fjxtmp;
+      f[j].y += fjytmp;
+      f[j].z += fjztmp;
+    }
+    f[i].x += fxtmp;
+    f[i].y += fytmp;
+    f[i].z += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* report memory as the sum of what memory_usage_thr() returns and
+   what the serial base style PairTersoffMOD reports */
+double PairTersoffMODOMP::memory_usage()
+{
+  const double thr_bytes = memory_usage_thr();
+  const double base_bytes = PairTersoffMOD::memory_usage();
+
+  return thr_bytes + base_bytes;
+}
--- a/src/USER-OMP/pair_tersoff_mod_omp.h
+++ b/src/USER-OMP/pair_tersoff_mod_omp.h
@ -0,0 +1,43 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(tersoff/mod/omp,PairTersoffMODOMP)
+
+#else
+
+#ifndef LMP_PAIR_TERSOFF_MOD_OMP_H
+#define LMP_PAIR_TERSOFF_MOD_OMP_H
+
+#include "pair_tersoff_mod.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+// USER-OMP counterpart of PairTersoffMOD, registered above as pair style
+// tersoff/mod/omp; combines the base pair style with ThrOMP.
+class PairTersoffMODOMP
+  : public PairTersoffMOD,
+    public ThrOMP {
+
+ public:
+  PairTersoffMODOMP(class LAMMPS *);
+
+  // overrides of the pair style entry points
+  virtual void compute(int eflag, int vflag);
+  virtual double memory_usage();
+
+ private:
+  // force kernel working on the index range [iifrom,iito) with the
+  // per-thread data in thr; instantiated once per flag combination
+  template <int EVFLAG, int EFLAG, int VFLAG_ATOM>
+  void eval(int iifrom, int iito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/pair_zbl_omp.cpp
+++ b/src/USER-OMP/pair_zbl_omp.cpp
@ -0,0 +1,170 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_zbl_omp.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+using namespace PairZBLConstants;
+
+/* ---------------------------------------------------------------------- */
+
+PairZBLOMP::PairZBLOMP(LAMMPS *lmp)
+  : PairZBL(lmp),
+    ThrOMP(lmp, THR_PAIR)
+{
+  // clear the respa_enable flag for this style
+  respa_enable = 0;
+
+  // add the OMP bit to the suffix flags
+  suffix_flag = suffix_flag | Suffix::OMP;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* set up the tally flags, then let every thread run the eval<> kernel
+   on its share of the neighbor list and reduce its per-thread data */
+void PairZBLOMP::compute(int eflag, int vflag)
+{
+  // nothing requested: just zero evflag and vflag_fdotr
+  if (!(eflag || vflag)) {
+    vflag_fdotr = 0;
+    evflag = 0;
+  } else {
+    ev_setup(eflag,vflag);
+  }
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int nall = atom->nlocal + atom->nghost;
+
+#if defined(_OPENMP)
+#pragma omp parallel default(none) shared(eflag,vflag)
+#endif
+  {
+    int start, stop, mytid;
+
+    // loop_setup_thr() fills in this thread's id and index range
+    loop_setup_thr(start, stop, mytid, inum, nthreads);
+    ThrData *thr = fix->get_thr(mytid);
+    ev_setup_thr(eflag, vflag, nall, eatom, vatom, thr);
+
+    // pick the instantiation eval<EVFLAG,EFLAG,NEWTON_PAIR>
+    if (!evflag) {
+      if (force->newton_pair) eval<0,0,1>(start, stop, thr);
+      else eval<0,0,0>(start, stop, thr);
+    } else if (eflag) {
+      if (force->newton_pair) eval<1,1,1>(start, stop, thr);
+      else eval<1,1,0>(start, stop, thr);
+    } else {
+      if (force->newton_pair) eval<1,0,1>(start, stop, thr);
+      else eval<1,0,0>(start, stop, thr);
+    }
+
+    reduce_thr(this, eflag, vflag, thr);
+  } // end of omp parallel region
+}
+
+/* ----------------------------------------------------------------------
+   ZBL pair kernel for the atoms ilist[iifrom] ... ilist[iito-1].
+   Template flags:
+     EVFLAG      - nonzero: tally each pair through ev_tally_thr()
+     EFLAG       - nonzero: evaluate the pair energy evdwl
+     NEWTON_PAIR - zero: apply the reaction force on j only if j < nlocal
+   Forces are accumulated into the array returned by thr->get_f().
+------------------------------------------------------------------------- */
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+void PairZBLOMP::eval(int iifrom, int iito, ThrData * const thr)
+{
+  const dbl3_t * _noalias const x = (dbl3_t *) atom->x[0];
+  dbl3_t * _noalias const f = (dbl3_t *) thr->get_f()[0];
+  const int * _noalias const type = atom->type;
+  const int * _noalias const ilist = list->ilist;
+  const int * _noalias const numneigh = list->numneigh;
+  const int * const * const firstneigh = list->firstneigh;
+
+  double xtmp,ytmp,ztmp,delx,dely,delz,fxtmp,fytmp,fztmp;
+  double rsq,t,fswitch,eswitch,evdwl,fpair;
+
+  const int nlocal = atom->nlocal;
+  int j,jj,jnum,jtype;
+
+  // stays 0.0 in the tally calls when EFLAG is off
+  evdwl = 0.0;
+
+  // loop over neighbors of my atoms
+
+  for (int ii = iifrom; ii < iito; ++ii) {
+    const int i = ilist[ii];
+    const int itype = type[i];
+    const int    * _noalias const jlist = firstneigh[i];
+    // rows of the switching-function coefficient tables for type itype
+    const double * _noalias const sw1i = sw1[itype];
+    const double * _noalias const sw2i = sw2[itype];
+    const double * _noalias const sw3i = sw3[itype];
+    const double * _noalias const sw4i = sw4[itype];
+    const double * _noalias const sw5i = sw5[itype];
+
+    xtmp = x[i].x;
+    ytmp = x[i].y;
+    ztmp = x[i].z;
+    jnum = numneigh[i];
+    // force on atom i is summed locally and stored once after the j loop
+    fxtmp=fytmp=fztmp=0.0;
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      // only the index bits are kept; no special-bond factor is looked up here
+      j &= NEIGHMASK;
+
+      delx = xtmp - x[j].x;
+      dely = ytmp - x[j].y;
+      delz = ztmp - x[j].z;
+      rsq = delx*delx + dely*dely + delz*delz;
+      jtype = type[j];
+
+      if (rsq < cut_globalsq) {
+        const double r = sqrt(rsq);
+        // presumably dE/dr of the unswitched ZBL potential - confirm in PairZBL
+        fpair = dzbldr(r, itype, jtype);
+
+        // beyond cut_inner add the switching term, a polynomial in
+        // t = r - cut_inner; t is reused by the energy branch below
+        if (r > cut_inner) {
+          t = r - cut_inner;
+          fswitch = t*t *
+            (sw1i[jtype] + sw2i[jtype]*t);
+          fpair += fswitch;
+        }
+
+        // flip the sign and divide by r so that fpair multiplies the
+        // separation vector (delx,dely,delz) directly
+        fpair *= -1.0/r;
+        fxtmp += delx*fpair;
+        fytmp += dely*fpair;
+        fztmp += delz*fpair;
+
+        if (NEWTON_PAIR || j < nlocal) {
+          f[j].x -= delx*fpair;
+          f[j].y -= dely*fpair;
+          f[j].z -= delz*fpair;
+        }
+
+        if (EFLAG) {
+          evdwl = e_zbl(r, itype, jtype);
+          // sw5 is added at every distance inside the global cutoff
+          evdwl += sw5i[jtype];
+          // same condition as the force branch above, so t is set whenever it is read
+          if (r > cut_inner) {
+            eswitch = t*t*t *
+              (sw3i[jtype] + sw4i[jtype]*t);
+            evdwl += eswitch;
+          }
+        }
+
+        if (EVFLAG) ev_tally_thr(this,i,j,nlocal,NEWTON_PAIR,
+                                 evdwl,0.0,fpair,delx,dely,delz,thr);
+      }
+    }
+    f[i].x += fxtmp;
+    f[i].y += fytmp;
+    f[i].z += fztmp;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* report memory as the sum of what memory_usage_thr() returns and
+   what the serial base style PairZBL reports */
+double PairZBLOMP::memory_usage()
+{
+  const double thr_bytes = memory_usage_thr();
+  const double base_bytes = PairZBL::memory_usage();
+
+  return thr_bytes + base_bytes;
+}
--- a/src/USER-OMP/pair_zbl_omp.h
+++ b/src/USER-OMP/pair_zbl_omp.h
@ -0,0 +1,48 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Axel Kohlmeyer (Temple U)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(zbl/omp,PairZBLOMP)
+
+#else
+
+#ifndef LMP_PAIR_ZBL_OMP_H
+#define LMP_PAIR_ZBL_OMP_H
+
+#include "pair_zbl.h"
+#include "thr_omp.h"
+
+namespace LAMMPS_NS {
+
+// USER-OMP counterpart of PairZBL, registered above as pair style
+// zbl/omp; combines the base pair style with ThrOMP.
+class PairZBLOMP
+  : public PairZBL,
+    public ThrOMP {
+
+ public:
+  PairZBLOMP(class LAMMPS *);
+
+  // overrides of the pair style entry points
+  virtual void compute(int eflag, int vflag);
+  virtual double memory_usage();
+
+ private:
+  // force kernel working on the index range [iifrom,iito) with the
+  // per-thread data in thr; instantiated once per flag combination
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR>
+  void eval(int iifrom, int iito, ThrData * const thr);
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-OMP/thr_data.cpp
+++ b/src/USER-OMP/thr_data.cpp
@ -25,6 +25,7 @@

 using namespace LAMMPS_NS;

+/* ---------------------------------------------------------------------- */

 ThrData::ThrData(int tid)
  : _f(0),_torque(0),_erforce(0),_de(0),_drho(0),_mu(0),_lambda(0),_rhoB(0),
@ -261,7 +262,7 @@ double ThrData::memory_usage()
 void LAMMPS_NS::data_reduce_thr(double *dall, int nall, int nthreads, int ndim, int tid)
 {
 #if defined(_OPENMP)
-  // NOOP in non-threaded execution.
+  // NOOP in single-threaded execution.
  if (nthreads == 1) return;
 #pragma omp barrier
  {
@ -270,15 +271,72 @@ void LAMMPS_NS::data_reduce_thr(double *dall, int nall, int nthreads, int ndim,
    const int ifrom = tid*idelta;
    const int ito   = ((ifrom + idelta) > nvals) ? nvals : (ifrom + idelta);

-    // this if protects against having more threads than atoms
+#if defined(USER_OMP_NO_UNROLL)
    if (ifrom < nvals) {
-      for (int m = ifrom; m < ito; ++m) {
+      int m = 0;
+
+      for (m = ifrom; m < ito; ++m) {
        for (int n = 1; n < nthreads; ++n) {
          dall[m] += dall[n*nvals + m];
          dall[n*nvals + m] = 0.0;
        }
      }
    }
+#else
+    // this if protects against having more threads than atoms
+    if (ifrom < nvals) {
+      int m = 0;
+
+      // for architectures that have L1 D-cache line sizes of 64 bytes
+      // (8 doubles) wide, explicitly unroll this loop to compute 8
+      // contiguous values in the array at a time
+      // -- modify this code based on the size of the cache line
+      double t0, t1, t2, t3, t4, t5, t6, t7;
+      for (m = ifrom; m < (ito-7); m+=8) {
+        t0 = dall[m  ];
+        t1 = dall[m+1];
+        t2 = dall[m+2];
+        t3 = dall[m+3];
+        t4 = dall[m+4];
+        t5 = dall[m+5];
+        t6 = dall[m+6];
+        t7 = dall[m+7];
+        for (int n = 1; n < nthreads; ++n) {
+          t0 += dall[n*nvals + m  ];
+          t1 += dall[n*nvals + m+1];
+          t2 += dall[n*nvals + m+2];
+          t3 += dall[n*nvals + m+3];
+          t4 += dall[n*nvals + m+4];
+          t5 += dall[n*nvals + m+5];
+          t6 += dall[n*nvals + m+6];
+          t7 += dall[n*nvals + m+7];
+          dall[n*nvals + m  ] = 0.0;
+          dall[n*nvals + m+1] = 0.0;
+          dall[n*nvals + m+2] = 0.0;
+          dall[n*nvals + m+3] = 0.0;
+          dall[n*nvals + m+4] = 0.0;
+          dall[n*nvals + m+5] = 0.0;
+          dall[n*nvals + m+6] = 0.0;
+          dall[n*nvals + m+7] = 0.0;
+        }
+        dall[m  ] = t0;
+        dall[m+1] = t1;
+        dall[m+2] = t2;
+        dall[m+3] = t3;
+        dall[m+4] = t4;
+        dall[m+5] = t5;
+        dall[m+6] = t6;
+        dall[m+7] = t7;
+      }
+      // do the last < 8 values
+      for (; m < ito; m++) {
+        for (int n = 1; n < nthreads; ++n) {
+          dall[m] += dall[n*nvals + m];
+          dall[n*nvals + m] = 0.0;
+        }
+      }
+    }
+#endif
  }
 #else
  // NOOP in non-threaded execution.