git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14499 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2016-01-26 15:11:58 +00:00
parent ef429798c0
commit db9fe7ac9f
28 changed files with 2849 additions and 351 deletions
--- a/src/USER-INTEL/angle_charmm_intel.cpp
+++ b/src/USER-INTEL/angle_charmm_intel.cpp
@ -0,0 +1,331 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include <stdlib.h>
+#include "angle_charmm_intel.h"
+#include "atom.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "comm.h"
+#include "force.h"
+#include "math_const.h"
+#include "memory.h"
+#include "suffix.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define SMALL (flt_t)0.001
+typedef struct { int a,b,c,t;  } int4_t;
+
+/* ---------------------------------------------------------------------- */
+
+AngleCharmmIntel::AngleCharmmIntel(LAMMPS *lmp) : AngleCharmm(lmp) 
+{
+  suffix_flag |= Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+AngleCharmmIntel::~AngleCharmmIntel()
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleCharmmIntel::compute(int eflag, int vflag)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_use_base) {
+    AngleCharmm::compute(eflag, vflag);
+    return;
+  }
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void AngleCharmmIntel::compute(int eflag, int vflag,
+			       IntelBuffers<flt_t,acc_t> *buffers,
+			       const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = 0;
+
+  if (evflag) {
+    if (eflag) {
+      if (force->newton_bond)
+	eval<1,1,1>(vflag, buffers, fc);
+      else
+	eval<1,1,0>(vflag, buffers, fc);
+    } else {
+      if (force->newton_bond)
+	eval<1,0,1>(vflag, buffers, fc);
+      else
+	eval<1,0,0>(vflag, buffers, fc);
+    }
+  } else {
+    if (force->newton_bond)
+      eval<0,0,1>(vflag, buffers, fc);
+    else
+      eval<0,0,0>(vflag, buffers, fc);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+void AngleCharmmIntel::eval(const int vflag, 
+			    IntelBuffers<flt_t,acc_t> *buffers,
+			    const ForceConst<flt_t> &fc)
+
+{
+  const int inum = neighbor->nanglelist;
+  if (inum == 0) return;
+
+  ATOM_T * _noalias const x = buffers->get_x(0);
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  int f_stride;
+  if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
+  else f_stride = buffers->get_stride(nlocal);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+
+  acc_t oeangle, ov0, ov1, ov2, ov3, ov4, ov5;
+  if (EVFLAG) {
+    if (EFLAG)
+      oeangle = (acc_t)0.0;
+    if (vflag) {
+      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+    }
+  }
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(f_start,f_stride,fc)	\
+    reduction(+:oeangle,ov0,ov1,ov2,ov3,ov4,ov5)
+  #endif
+  {
+    int nfrom, nto, tid;
+    IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+
+    FORCE_T * _noalias const f = f_start + (tid * f_stride);
+    if (fix->need_zero(tid))
+      memset(f, 0, f_stride * sizeof(FORCE_T));
+
+    const int4_t * _noalias const anglelist = 
+      (int4_t *) neighbor->anglelist[0];
+
+    for (int n = nfrom; n < nto; n++) {
+      const int i1 = anglelist[n].a;
+      const int i2 = anglelist[n].b;
+      const int i3 = anglelist[n].c;
+      const int type = anglelist[n].t;
+
+      // 1st bond
+
+      const flt_t delx1 = x[i1].x - x[i2].x;
+      const flt_t dely1 = x[i1].y - x[i2].y;
+      const flt_t delz1 = x[i1].z - x[i2].z;
+
+      const flt_t rsq1 = delx1*delx1 + dely1*dely1 + delz1*delz1;
+      const flt_t r1 = sqrt(rsq1);
+
+      // 2nd bond
+
+      const flt_t delx2 = x[i3].x - x[i2].x;
+      const flt_t dely2 = x[i3].y - x[i2].y;
+      const flt_t delz2 = x[i3].z - x[i2].z;
+
+      const flt_t rsq2 = delx2*delx2 + dely2*dely2 + delz2*delz2;
+      const flt_t r2 = sqrt(rsq2);
+
+      // Urey-Bradley bond
+
+      const flt_t delxUB = x[i3].x - x[i1].x;
+      const flt_t delyUB = x[i3].y - x[i1].y;
+      const flt_t delzUB = x[i3].z - x[i1].z;
+
+      const flt_t rsqUB = delxUB*delxUB + delyUB*delyUB + delzUB*delzUB;
+      const flt_t rUB = sqrt(rsqUB);
+
+      // Urey-Bradley force & energy
+
+      const flt_t dr = rUB - fc.fc[type].r_ub;
+      const flt_t rk = fc.fc[type].k_ub * dr;
+
+      flt_t forceUB;
+      if (rUB > (flt_t)0.0) forceUB = (flt_t)-2.0*rk/rUB;
+      else forceUB = 0.0;
+
+      flt_t eangle;
+      if (EFLAG) eangle = rk*dr;
+
+      // angle (cos and sin)
+
+      flt_t c = delx1*delx2 + dely1*dely2 + delz1*delz2;
+      c /= r1*r2;
+
+      if (c > (flt_t)1.0) c = (flt_t)1.0;
+      if (c < (flt_t)-1.0) c = (flt_t)-1.0;
+
+      flt_t s = sqrt((flt_t)1.0 - c*c);
+      if (s < SMALL) s = SMALL;
+      s = (flt_t)1.0/s;
+
+      // harmonic force & energy
+
+      const flt_t dtheta = acos(c) - fc.fc[type].theta0;
+      const flt_t tk = fc.fc[type].k * dtheta;
+
+      if (EFLAG) eangle += tk*dtheta;
+
+      const flt_t a = (flt_t)-2.0 * tk * s;
+      const flt_t a11 = a*c / rsq1;
+      const flt_t a12 = -a / (r1*r2);
+      const flt_t a22 = a*c / rsq2;
+
+      const flt_t f1x = a11*delx1 + a12*delx2 - delxUB*forceUB;
+      const flt_t f1y = a11*dely1 + a12*dely2 - delyUB*forceUB;
+      const flt_t f1z = a11*delz1 + a12*delz2 - delzUB*forceUB;
+
+      const flt_t f3x = a22*delx2 + a12*delx1 + delxUB*forceUB;
+      const flt_t f3y = a22*dely2 + a12*dely1 + delyUB*forceUB;
+      const flt_t f3z = a22*delz2 + a12*delz1 + delzUB*forceUB;
+
+      // apply force to each of 3 atoms
+
+      if (NEWTON_BOND || i1 < nlocal) {
+        f[i1].x += f1x;
+	f[i1].y += f1y;
+	f[i1].z += f1z;
+      }
+
+      if (NEWTON_BOND || i2 < nlocal) {
+        f[i2].x -= f1x + f3x;
+        f[i2].y -= f1y + f3y;
+        f[i2].z -= f1z + f3z;
+      }
+
+      if (NEWTON_BOND || i3 < nlocal) {
+        f[i3].x += f3x;
+        f[i3].y += f3y;
+        f[i3].z += f3z;
+      }
+
+      if (EVFLAG) {
+	IP_PRE_ev_tally_angle(EFLAG, eatom, vflag, eangle, i1, i2, i3,f1x, 
+                              f1y, f1z, f3x, f3y, f3z, delx1, dely1, delz1, 
+                              delx2, dely2, delz2, oeangle, f, NEWTON_BOND, 
+                              nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+      }
+    } // for n
+  } // omp parallel
+
+  if (EVFLAG) {
+    if (EFLAG)
+      energy += oeangle;
+    if (vflag) {
+      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    }
+  }
+
+  fix->set_reduce_flag();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AngleCharmmIntel::init_style()
+{
+  AngleCharmm::init_style();
+
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _use_base = 0;
+  if (fix->offload_balance() != 0.0) {
+    _use_base = 1;
+    return;
+  }
+  #endif
+
+  fix->bond_init_check();
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void AngleCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                        IntelBuffers<flt_t,acc_t> *buffers)
+{
+  const int bp1 = atom->ndihedraltypes + 1;
+  fc.set_ntypes(bp1,memory);
+
+  for (int i = 0; i < bp1; i++) {
+    fc.fc[i].k = k[i];
+    fc.fc[i].theta0 = theta0[i];
+    fc.fc[i].k_ub = k_ub[i];
+    fc.fc[i].r_ub = r_ub[i];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void AngleCharmmIntel::ForceConst<flt_t>::set_ntypes(const int nangletypes,
+	                                             Memory *memory) {
+  if (nangletypes != _nangletypes) {
+    if (_nangletypes > 0)
+      _memory->destroy(fc);
+    
+    if (nangletypes > 0)
+      _memory->create(fc,nangletypes,"anglecharmmintel.fc");
+  }
+  _nangletypes = nangletypes;
+  _memory = memory;
+}
--- a/src/USER-INTEL/angle_charmm_intel.h
+++ b/src/USER-INTEL/angle_charmm_intel.h
@ -0,0 +1,88 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef ANGLE_CLASS
+
+AngleStyle(charmm/intel,AngleCharmmIntel)
+
+#else
+
+#ifndef LMP_ANGLE_CHARMM_INTEL_H
+#define LMP_ANGLE_CHARMM_INTEL_H
+
+#include <stdio.h>
+#include "angle_charmm.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class AngleCharmmIntel : public AngleCharmm {
+ public:
+  AngleCharmmIntel(class LAMMPS *);
+  virtual ~AngleCharmmIntel();
+  virtual void compute(int, int);
+  virtual void init_style();
+
+ protected:
+  FixIntel *fix;
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
+	    const ForceConst<flt_t> &fc);
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _use_base;
+  #endif
+
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t k, theta0, k_ub, r_ub; } fc_packed1;
+
+    fc_packed1 *fc;
+    ForceConst() : _nangletypes(0)  {}
+    ~ForceConst() { set_ntypes(0, NULL); }
+
+    void set_ntypes(const int nangletypes, Memory *memory);
+
+   private:
+    int _nangletypes;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Incorrect args for angle coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+*/
--- a/src/USER-INTEL/bond_harmonic_intel.cpp
+++ b/src/USER-INTEL/bond_harmonic_intel.cpp
@ -0,0 +1,261 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include <stdlib.h>
+#include "bond_harmonic_intel.h"
+#include "atom.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "suffix.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+typedef struct { int a,b,t;  } int3_t;
+
+/* ---------------------------------------------------------------------- */
+
+BondHarmonicIntel::BondHarmonicIntel(LAMMPS *lmp) : BondHarmonic(lmp) 
+{
+  suffix_flag |= Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+BondHarmonicIntel::~BondHarmonicIntel()
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondHarmonicIntel::compute(int eflag, int vflag)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_use_base) {
+    BondHarmonic::compute(eflag, vflag);
+    return;
+  }
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void BondHarmonicIntel::compute(int eflag, int vflag,
+				IntelBuffers<flt_t,acc_t> *buffers,
+				const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = 0;
+
+  if (evflag) {
+    if (eflag) {
+      if (force->newton_bond)
+	eval<1,1,1>(vflag, buffers, fc);
+      else
+	eval<1,1,0>(vflag, buffers, fc);
+    } else {
+      if (force->newton_bond)
+	eval<1,0,1>(vflag, buffers, fc);
+      else
+	eval<1,0,0>(vflag, buffers, fc);
+    }
+  } else {
+    if (force->newton_bond)
+      eval<0,0,1>(vflag, buffers, fc);
+    else
+      eval<0,0,0>(vflag, buffers, fc);
+  }
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+void BondHarmonicIntel::eval(const int vflag, 
+			     IntelBuffers<flt_t,acc_t> *buffers,
+			     const ForceConst<flt_t> &fc)
+{
+  const int inum = neighbor->nbondlist;
+  if (inum == 0) return;
+
+  ATOM_T * _noalias const x = buffers->get_x(0);
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  int f_stride;
+  if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
+  else f_stride = buffers->get_stride(nlocal);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+
+  acc_t oebond, ov0, ov1, ov2, ov3, ov4, ov5;
+  if (EVFLAG) {
+    if (EFLAG)
+      oebond = (acc_t)0.0;
+    if (vflag) {
+      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+    }
+  }
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(f_start,f_stride,fc)		  \
+    reduction(+:oebond,ov0,ov1,ov2,ov3,ov4,ov5)
+  #endif
+  {
+    int nfrom, nto, tid;
+    IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+
+    FORCE_T * _noalias const f = f_start + (tid * f_stride);
+    if (fix->need_zero(tid))
+      memset(f, 0, f_stride * sizeof(FORCE_T));
+
+    const int3_t * _noalias const bondlist = 
+      (int3_t *) neighbor->bondlist[0];
+
+    for (int n = nfrom; n < nto; n++) {
+      const int i1 = bondlist[n].a;
+      const int i2 = bondlist[n].b;
+      const int type = bondlist[n].t;
+
+      const flt_t delx = x[i1].x - x[i2].x;
+      const flt_t dely = x[i1].y - x[i2].y;
+      const flt_t delz = x[i1].z - x[i2].z;
+
+      const flt_t rsq = delx*delx + dely*dely + delz*delz;
+      const flt_t r = sqrt(rsq);
+      const flt_t dr = r - fc.fc[type].r0;
+      const flt_t rk = fc.fc[type].k * dr;
+
+      // force & energy
+
+      flt_t fbond;
+      if (r > (flt_t)0.0) fbond = (flt_t)-2.0*rk/r;
+      else fbond = (flt_t)0.0;
+
+      flt_t ebond;
+      if (EFLAG) ebond = rk*dr;
+
+      // apply force to each of 2 atoms
+      if (NEWTON_BOND || i1 < nlocal) {
+        f[i1].x += delx*fbond;
+        f[i1].y += dely*fbond;
+        f[i1].z += delz*fbond;
+      }
+
+      if (NEWTON_BOND || i2 < nlocal) {
+        f[i2].x -= delx*fbond;
+        f[i2].y -= dely*fbond;
+        f[i2].z -= delz*fbond;
+      }
+
+      if (EVFLAG) {
+	IP_PRE_ev_tally_bond(EFLAG, eatom, vflag, ebond, i1, i2, fbond, 
+                             delx, dely, delz, oebond, f, NEWTON_BOND, 
+                             nlocal, ov0, ov1, ov2, ov3, ov4, ov5);
+      }
+    } // for n
+  } // omp parallel
+
+  if (EVFLAG) {
+    if (EFLAG)
+      energy += oebond;
+    if (vflag) {
+      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    }
+  }
+
+  fix->set_reduce_flag();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void BondHarmonicIntel::init_style()
+{
+  BondHarmonic::init_style();
+
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _use_base = 0;
+  if (fix->offload_balance() != 0.0) {
+    _use_base = 1;
+    return;
+  }
+  #endif
+
+  fix->bond_init_check();
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void BondHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                         IntelBuffers<flt_t,acc_t> *buffers)
+{
+  const int bp1 = atom->nbondtypes + 1;
+  fc.set_ntypes(bp1,memory);
+
+  for (int i = 0; i < bp1; i++) {
+    fc.fc[i].k = k[i];
+    fc.fc[i].r0 = r0[i];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void BondHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
+	                                              Memory *memory) {
+  if (nbondtypes != _nbondtypes) {
+    if (_nbondtypes > 0)
+      _memory->destroy(fc);
+    
+    if (nbondtypes > 0)
+      _memory->create(fc,nbondtypes,"bondharmonicintel.fc");
+  }
+  _nbondtypes = nbondtypes;
+  _memory = memory;
+}
--- a/src/USER-INTEL/bond_harmonic_intel.h
+++ b/src/USER-INTEL/bond_harmonic_intel.h
@ -0,0 +1,88 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef BOND_CLASS
+
+BondStyle(harmonic/intel,BondHarmonicIntel)
+
+#else
+
+#ifndef LMP_BOND_HARMONIC_INTEL_H
+#define LMP_BOND_HARMONIC_INTEL_H
+
+#include <stdio.h>
+#include "bond_harmonic.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class BondHarmonicIntel : public BondHarmonic {
+ public:
+  BondHarmonicIntel(class LAMMPS *);
+  virtual ~BondHarmonicIntel();
+  virtual void compute(int, int);
+  virtual void init_style();
+
+ protected:
+  FixIntel *fix;
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
+	    const ForceConst<flt_t> &fc);
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _use_base;
+  #endif
+
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t k, r0; } fc_packed1;
+    fc_packed1 *fc;
+
+    ForceConst() : _nbondtypes(0)  {}
+    ~ForceConst() { set_ntypes(0, NULL); }
+
+    void set_ntypes(const int nbondtypes, Memory *memory);
+
+   private:
+    int _nbondtypes;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Incorrect args for bond coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+*/
--- a/src/USER-INTEL/dihedral_charmm_intel.cpp
+++ b/src/USER-INTEL/dihedral_charmm_intel.cpp
@ -0,0 +1,538 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "mpi.h"
+#include "math.h"
+#include "dihedral_charmm_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "pair.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define PTOLERANCE (flt_t)1.05
+#define MTOLERANCE (flt_t)-1.05
+#define SMALL      (flt_t)0.001
+typedef struct { int a,b,c,d,t;  } int5_t;
+
+/* ---------------------------------------------------------------------- */
+
+DihedralCharmmIntel::DihedralCharmmIntel(class LAMMPS *lmp)
+  : DihedralCharmm(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralCharmmIntel::compute(int eflag, int vflag)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_use_base) {
+    DihedralCharmm::compute(eflag, vflag);
+    return;
+  }
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void DihedralCharmmIntel::compute(int eflag, int vflag,
+				  IntelBuffers<flt_t,acc_t> *buffers,
+				  const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  // insure pair->ev_tally() will use 1-4 virial contribution
+
+  if (weightflag && vflag_global == 2)
+    force->pair->vflag_either = force->pair->vflag_global = 1;
+
+  if (evflag) {
+    if (eflag) {
+      if (force->newton_bond)
+	eval<1,1,1>(vflag, buffers, fc);
+      else
+	eval<1,1,0>(vflag, buffers, fc);
+    } else {
+      if (force->newton_bond)
+	eval<1,0,1>(vflag, buffers, fc);
+      else
+	eval<1,0,0>(vflag, buffers, fc);
+    }
+  } else {
+    if (force->newton_bond)
+      eval<0,0,1>(vflag, buffers, fc);
+    else
+      eval<0,0,0>(vflag, buffers, fc);
+  }
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+void DihedralCharmmIntel::eval(const int vflag, 
+			       IntelBuffers<flt_t,acc_t> *buffers,
+			       const ForceConst<flt_t> &fc)
+
+{
+  const int inum = neighbor->ndihedrallist;
+  if (inum == 0) return;
+
+  ATOM_T * _noalias const x = buffers->get_x(0);
+  flt_t * _noalias const q = buffers->get_q(0);
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  int f_stride;
+  if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
+  else f_stride = buffers->get_stride(nlocal);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+
+  acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
+  acc_t oevdwl, oecoul, opv0, opv1, opv2, opv3, opv4, opv5;
+  if (EVFLAG) {
+    if (EFLAG)
+      oevdwl = oecoul = oedihedral = (acc_t)0.0;
+    if (vflag) {
+      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+      opv0 = opv1 = opv2 = opv3 = opv4 = opv5 = (acc_t)0.0;
+    }
+  }
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(f_start,f_stride,fc)		  \
+    reduction(+:oevdwl,oecoul,oedihedral,ov0,ov1,ov2,ov3,ov4,ov5, \
+	      opv0,opv1,opv2,opv3,opv4,opv5)
+  #endif
+  {
+    int nfrom, nto, tid;
+    IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+
+    FORCE_T * _noalias const f = f_start + (tid * f_stride);
+    if (fix->need_zero(tid))
+      memset(f, 0, f_stride * sizeof(FORCE_T));
+
+    const int5_t * _noalias const dihedrallist = 
+      (int5_t *) neighbor->dihedrallist[0];
+    const flt_t qqrd2e = force->qqrd2e;
+
+    acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
+    acc_t sevdwl, secoul, spv0, spv1, spv2, spv3, spv4, spv5;
+    if (EVFLAG) {
+      if (EFLAG)
+	sevdwl = secoul = sedihedral = (acc_t)0.0;
+      if (vflag) {
+	sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+	spv0 = spv1 = spv2 = spv3 = spv4 = spv5 = (acc_t)0.0;
+      }
+    }
+
+    for (int n = nfrom; n < nto; n++) {
+      const int i1 = dihedrallist[n].a;
+      const int i2 = dihedrallist[n].b;
+      const int i3 = dihedrallist[n].c;
+      const int i4 = dihedrallist[n].d;
+      const int type = dihedrallist[n].t;
+
+      // 1st bond
+
+      const flt_t vb1x = x[i1].x - x[i2].x;
+      const flt_t vb1y = x[i1].y - x[i2].y;
+      const flt_t vb1z = x[i1].z - x[i2].z;
+      const int itype = x[i1].w;
+
+      // 2nd bond
+
+      const flt_t vb2xm = x[i2].x - x[i3].x;
+      const flt_t vb2ym = x[i2].y - x[i3].y;
+      const flt_t vb2zm = x[i2].z - x[i3].z;
+
+      // 3rd bond
+      
+      const flt_t vb3x = x[i4].x - x[i3].x;
+      const flt_t vb3y = x[i4].y - x[i3].y;
+      const flt_t vb3z = x[i4].z - x[i3].z;
+      const int jtype = x[i4].w;
+
+      // 1-4
+
+      const flt_t delx = x[i1].x - x[i4].x;
+      const flt_t dely = x[i1].y - x[i4].y;
+      const flt_t delz = x[i1].z - x[i4].z;
+
+
+      // c,s calculation
+
+      const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
+      const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
+      const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
+      const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
+      const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
+      const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
+
+      const flt_t rasq = ax*ax + ay*ay + az*az;
+      const flt_t rbsq = bx*bx + by*by + bz*bz;
+      const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
+      const flt_t rg = sqrt(rgsq);
+
+      flt_t rginv, ra2inv, rb2inv;
+      rginv = ra2inv = rb2inv = (flt_t)0.0;
+      if (rg > 0) rginv = (flt_t)1.0/rg;
+      if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
+      if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
+      const flt_t rabinv = sqrt(ra2inv*rb2inv);
+
+      flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
+      const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
+
+      // error check
+      if (c > PTOLERANCE || c < MTOLERANCE) {
+	int me = comm->me;
+
+	if (screen) {
+	  char str[128];
+	  sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
+		  TAGINT_FORMAT " " TAGINT_FORMAT " "
+		  TAGINT_FORMAT " " TAGINT_FORMAT,
+		  me,tid,update->ntimestep,
+		  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+	  error->warning(FLERR,str,0);
+	  fprintf(screen,"  1st atom: %d %g %g %g\n",
+		  me,x[i1].x,x[i1].y,x[i1].z);
+	  fprintf(screen,"  2nd atom: %d %g %g %g\n",
+		  me,x[i2].x,x[i2].y,x[i2].z);
+	  fprintf(screen,"  3rd atom: %d %g %g %g\n",
+		  me,x[i3].x,x[i3].y,x[i3].z);
+	  fprintf(screen,"  4th atom: %d %g %g %g\n",
+		  me,x[i4].x,x[i4].y,x[i4].z);
+	}
+      }
+
+      if (c > (flt_t)1.0) c = (flt_t)1.0;
+      if (c < (flt_t)-1.0) c = (flt_t)-1.0;
+
+      const flt_t tcos_shift = fc.bp[type].cos_shift;
+      const flt_t tsin_shift = fc.bp[type].sin_shift;
+      const flt_t tk = fc.bp[type].k;
+      const int m = fc.bp[type].multiplicity;
+
+      flt_t p = (flt_t)1.0;
+      flt_t ddf1, df1;
+      ddf1 = df1 = (flt_t)0.0;
+
+      for (int i = 0; i < m; i++) {
+	ddf1 = p*c - df1*s;
+	df1 = p*s + df1*c;
+	p = ddf1;
+      }
+
+      p = p*tcos_shift + df1*tsin_shift;
+      df1 = df1*tcos_shift - ddf1*tsin_shift;
+      df1 *= -m;
+      p += (flt_t)1.0;
+      
+      if (m == 0) {
+	p = (flt_t)1.0 + tcos_shift;
+	df1 = (flt_t)0.0;
+      }
+
+      const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
+      const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
+      const flt_t fga = fg*ra2inv*rginv;
+      const flt_t hgb = hg*rb2inv*rginv;
+      const flt_t gaa = -ra2inv*rg;
+      const flt_t gbb = rb2inv*rg;
+
+      const flt_t dtfx = gaa*ax;
+      const flt_t dtfy = gaa*ay;
+      const flt_t dtfz = gaa*az;
+      const flt_t dtgx = fga*ax - hgb*bx;
+      const flt_t dtgy = fga*ay - hgb*by;
+      const flt_t dtgz = fga*az - hgb*bz;
+      const flt_t dthx = gbb*bx;
+      const flt_t dthy = gbb*by;
+      const flt_t dthz = gbb*bz;
+
+      const flt_t df = -tk * df1;
+
+      const flt_t sx2 = df*dtgx;
+      const flt_t sy2 = df*dtgy;
+      const flt_t sz2 = df*dtgz;
+
+      flt_t f1x = df*dtfx;
+      flt_t f1y = df*dtfy;
+      flt_t f1z = df*dtfz;
+
+      const flt_t f2x = sx2 - f1x;
+      const flt_t f2y = sy2 - f1y;
+      const flt_t f2z = sz2 - f1z;
+
+      flt_t f4x = df*dthx;
+      flt_t f4y = df*dthy;
+      flt_t f4z = df*dthz;
+
+      const flt_t f3x = -sx2 - f4x;
+      const flt_t f3y = -sy2 - f4y;
+      const flt_t f3z = -sz2 - f4z;
+
+      if (EVFLAG) {
+	flt_t deng;
+	if (EFLAG) deng = tk * p;
+	IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, deng, i1, i2, i3, i4, f1x, 
+			      f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, vb1x, 
+			      vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x, vb3y, 
+			      vb3z, sedihedral, f, NEWTON_BOND, nlocal,
+			      sv0, sv1, sv2, sv3, sv4, sv5);
+      }
+
+
+      {
+        if (NEWTON_BOND || i2 < nlocal) {
+	  f[i2].x += f2x;
+	  f[i2].y += f2y;
+	  f[i2].z += f2z;
+        }
+
+        if (NEWTON_BOND || i3 < nlocal) {
+	  f[i3].x += f3x;
+	  f[i3].y += f3y;
+	  f[i3].z += f3z;
+        }
+      }
+
+      // 1-4 LJ and Coulomb interactions
+      // tally energy/virial in pair, using newton_bond as newton flag
+
+      const flt_t tweight = fc.weight[type];
+      const flt_t rsq = delx*delx + dely*dely + delz*delz;
+      const flt_t r2inv = (flt_t)1.0/rsq;
+      const flt_t r6inv = r2inv*r2inv*r2inv;
+
+      flt_t forcecoul;
+      if (implicit) forcecoul = qqrd2e * q[i1]*q[i4]*r2inv;
+      else forcecoul = qqrd2e * q[i1]*q[i4]*sqrt(r2inv);
+      const flt_t forcelj = r6inv * (fc.ljp[itype][jtype].lj1*r6inv - 
+				     fc.ljp[itype][jtype].lj2);
+      const flt_t fpair = tweight * (forcelj+forcecoul)*r2inv;
+
+      if (NEWTON_BOND || i1 < nlocal) {
+	f1x += delx*fpair;
+	f1y += dely*fpair;
+	f1z += delz*fpair;
+      }
+      if (NEWTON_BOND || i4 < nlocal) {
+	f4x -= delx*fpair;
+	f4y -= dely*fpair;
+	f4z -= delz*fpair;
+      }
+
+      if (EVFLAG) {
+	flt_t ev_pre = (flt_t)0;
+	if (NEWTON_BOND || i1 < nlocal)
+	  ev_pre += (flt_t)0.5;
+	if (NEWTON_BOND || i4 < nlocal)
+	  ev_pre += (flt_t)0.5;
+
+	if (EFLAG) {
+	  flt_t ecoul, evdwl;
+	  ecoul = tweight * forcecoul;
+	  evdwl = tweight * r6inv * (fc.ljp[itype][jtype].lj3*r6inv - 
+				     fc.ljp[itype][jtype].lj4);
+	  secoul += ev_pre * ecoul;
+	  sevdwl += ev_pre * evdwl;
+	  if (eatom) {
+	    evdwl *= (flt_t)0.5;
+	    evdwl += (flt_t)0.5 * ecoul;
+	    if (NEWTON_BOND || i1 < nlocal)
+	      f[i1].w += evdwl;
+	    if (NEWTON_BOND || i4 < nlocal)
+	      f[i4].w += evdwl;
+	  }
+	}
+	if (vflag) {                                                    
+	  spv0 += ev_pre * delx * delx * fpair;                               
+	  spv1 += ev_pre * dely * dely * fpair;                               
+	  spv2 += ev_pre * delz * delz * fpair;                               
+	  spv3 += ev_pre * delx * dely * fpair;                               
+	  spv4 += ev_pre * delx * delz * fpair;                               
+	  spv5 += ev_pre * dely * delz * fpair;                               
+	}                                                                    
+      }
+
+      // apply force to each of 4 atoms
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+	  f[i1].x += f1x;
+	  f[i1].y += f1y;
+	  f[i1].z += f1z;
+        }
+
+        if (NEWTON_BOND || i4 < nlocal) {
+	  f[i4].x += f4x;
+	  f[i4].y += f4y;
+	  f[i4].z += f4z;
+        }
+      }
+    } // for n
+    if (EVFLAG) {
+      if (EFLAG) {
+	oedihedral += sedihedral;
+	oecoul += secoul;
+	oevdwl += sevdwl;
+      }
+      if (vflag) {
+	ov0 += sv0; ov1 += sv1; ov2 += sv2; ov3 += sv3; ov4 += sv4; ov5 += sv5;
+	opv0 += spv0; opv1 += spv1; opv2 += spv2; 
+	opv3 += spv3; opv4 += spv4; opv5 += spv5;
+      }
+    }
+  } // omp parallel
+
+  if (EVFLAG) {
+    if (EFLAG) {
+      energy += oedihedral;
+      force->pair->eng_vdwl += oevdwl;
+      force->pair->eng_coul += oecoul;
+    }
+    if (vflag) {
+      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+      force->pair->virial[0] += opv0;
+      force->pair->virial[1] += opv1;
+      force->pair->virial[2] += opv2;
+      force->pair->virial[3] += opv3;
+      force->pair->virial[4] += opv4;
+      force->pair->virial[5] += opv5;
+    }
+  }
+
+  fix->set_reduce_flag();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralCharmmIntel::init_style()
+{
+  DihedralCharmm::init_style();
+
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _use_base = 0;
+  if (fix->offload_balance() != 0.0) {
+    _use_base = 1;
+    return;
+  }
+  #endif
+
+  fix->bond_init_check();
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void DihedralCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
+	                                   IntelBuffers<flt_t,acc_t> *buffers)
+{
+
+  const int tp1 = atom->ntypes + 1;
+  const int bp1 = atom->ndihedraltypes + 1;
+  fc.set_ntypes(tp1,bp1,memory);
+  buffers->set_ntypes(tp1);
+
+  for (int i = 0; i < tp1; i++) {
+    for (int j = 0; j < tp1; j++) {
+      fc.ljp[i][j].lj1 = lj14_1[i][j];
+      fc.ljp[i][j].lj2 = lj14_2[i][j];
+      fc.ljp[i][j].lj3 = lj14_3[i][j];
+      fc.ljp[i][j].lj4 = lj14_4[i][j];
+    }
+  }
+
+  for (int i = 0; i < bp1; i++) {
+    fc.bp[i].multiplicity = multiplicity[i];
+    fc.bp[i].cos_shift = cos_shift[i];
+    fc.bp[i].sin_shift = sin_shift[i];
+    fc.bp[i].k = k[i];
+    fc.weight[i] = weight[i];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void DihedralCharmmIntel::ForceConst<flt_t>::set_ntypes(const int npairtypes,
+            	                                        const int nbondtypes,
+	                                                Memory *memory) {
+  if (npairtypes != _npairtypes) {
+    if (_npairtypes > 0)
+      _memory->destroy(ljp);
+    if (npairtypes > 0)
+      memory->create(ljp,npairtypes,npairtypes,"fc.ljp");
+  }
+
+  if (nbondtypes != _nbondtypes) {
+    if (_nbondtypes > 0) {
+      _memory->destroy(bp);
+      _memory->destroy(weight);
+    }
+    
+    if (nbondtypes > 0) {
+      _memory->create(bp,nbondtypes,"dihedralcharmmintel.bp");
+      _memory->create(weight,nbondtypes,"dihedralcharmmintel.weight");
+    }
+  }
+  _npairtypes = npairtypes;
+  _nbondtypes = nbondtypes;
+  _memory = memory;
+}
--- a/src/USER-INTEL/dihedral_charmm_intel.h
+++ b/src/USER-INTEL/dihedral_charmm_intel.h
@ -0,0 +1,84 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(charmm/intel,DihedralCharmmIntel)
+
+#else
+
+#ifndef LMP_DIHEDRAL_CHARMM_INTEL_H
+#define LMP_DIHEDRAL_CHARMM_INTEL_H
+
+#include "dihedral_charmm.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class DihedralCharmmIntel : public DihedralCharmm {
+
+ public:
+  DihedralCharmmIntel(class LAMMPS *lmp);
+  virtual void compute(int, int);
+  void init_style();
+
+ private:
+  FixIntel *fix;
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
+	    const ForceConst<flt_t> &fc);
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _use_base;
+  #endif
+
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t lj1, lj2, lj3, lj4; } fc_packed1;
+    typedef struct { flt_t cos_shift, sin_shift, k; 
+                     int multiplicity; } fc_packed3;
+
+    fc_packed1 **ljp;
+    fc_packed3 *bp;
+    flt_t *weight;
+
+    ForceConst() : _npairtypes(0), _nbondtypes(0)  {}
+    ~ForceConst() { set_ntypes(0, 0, NULL); }
+
+    void set_ntypes(const int npairtypes, const int nbondtypes, Memory *memory);
+
+   private:
+    int _npairtypes, _nbondtypes;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@ -71,6 +71,12 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
  _offload_threads = 0;
  _offload_tpc = 4;

+  _force_array_s = 0;
+  _force_array_m = 0;
+  _force_array_d = 0;
+  _ev_array_s = 0;
+  _ev_array_d = 0;
+
  #ifdef _LMP_INTEL_OFFLOAD
  if (ncops < 0) error->all(FLERR,"Illegal package intel command");
  _offload_affinity_set = 0;
@ -293,6 +299,8 @@ void FixIntel::init()
    _mixed_buffers->zero_ev();
  else
    _double_buffers->zero_ev();
+
+  _need_reduce = 0;
 }

 /* ---------------------------------------------------------------------- */
@ -317,6 +325,10 @@ void FixIntel::pair_init_check(const bool cdmessage)

  _nbor_pack_width = 1;

+  if (strstr(update->integrate_style,"intel") == 0)
+    error->all(FLERR,
+	"Specified run_style does not support the Intel package.");
+
  #ifdef _LMP_INTEL_OFFLOAD
  if (_offload_balance != 0.0) atom->sortfreq = 1;

@ -348,9 +360,6 @@ void FixIntel::pair_init_check(const bool cdmessage)
      _sync_at_pair = 2;
  } else {
    _sync_at_pair = 0;
-    if (strstr(update->integrate_style,"intel") == 0)
-      error->all(FLERR,
-		 "Specified run_style does not support the Intel package.");
  }
  #endif
  _nthreads = comm->nthreads;
@ -415,6 +424,35 @@ void FixIntel::pair_init_check(const bool cdmessage)

 /* ---------------------------------------------------------------------- */

+void FixIntel::bond_init_check()
+{
+  if (_offload_balance != 0.0 && atom->molecular && 
+      force->newton_pair != force->newton_bond)
+    error->all(FLERR,
+      "USER-INTEL package requires same setting for newton bond and non-bond.");
+
+  int intel_pair = 0;
+  if (force->pair_match("/intel", 0) != NULL)
+    intel_pair = 1;
+  else if (force->pair_match("hybrid", 1) != NULL) {
+    PairHybrid *hybrid = (PairHybrid *) force->pair;
+    for (int i = 0; i < hybrid->nstyles; i++)
+      if (strstr(hybrid->keywords[i], "/intel") != NULL)
+        intel_pair = 1;
+  } else if (force->pair_match("hybrid/overlay", 1) != NULL) {
+    PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
+    for (int i = 0; i < hybrid->nstyles; i++)
+      if (strstr(hybrid->keywords[i], "/intel") != NULL)
+        intel_pair = 1;
+  }
+
+  if (intel_pair == 0)
+    error->all(FLERR, "Intel styles for bond/angle/dihedral/improper "
+      "require intel pair style.");
+}
+
+/* ---------------------------------------------------------------------- */
+
 void FixIntel::check_neighbor_intel()
 {
  #ifdef _LMP_INTEL_OFFLOAD
@ -436,6 +474,82 @@ void FixIntel::check_neighbor_intel()

 /* ---------------------------------------------------------------------- */

+void FixIntel::sync()
+{
+  if (_force_array_m != 0) {
+    if (_need_reduce) {
+      reduce_results(_force_array_m);
+      _need_reduce = 0;
+    }
+    add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom, 0);
+    _force_array_m = 0;
+  } else if (_force_array_d != 0) {
+    if (_need_reduce) {
+      reduce_results(_force_array_d);
+      _need_reduce = 0;
+    }
+    add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom, 0);
+    _force_array_d = 0;
+  } else if (_force_array_s != 0) {
+    if (_need_reduce) {
+      reduce_results(_force_array_s);
+      _need_reduce = 0;
+    }
+    add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom, 0);
+    _force_array_s = 0;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class ft>
+void FixIntel::reduce_results(ft * _noalias const f_start)
+{
+  int o_range, f_stride;
+  if (force->newton_pair)
+    o_range = atom->nlocal + atom->nghost;
+  else		
+    o_range = atom->nlocal;
+  IP_PRE_get_stride(f_stride, o_range, sizeof(ft), lmp->atom->torque);
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) shared(o_range, f_stride)
+  #endif
+  {
+    int iifrom, iito, tid;
+    IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, _nthreads,
+  			      sizeof(ft));
+
+    int t_off = f_stride;
+    if (_results_eatom) {
+      for (int t = 1; t < _nthreads; t++) {
+        _use_simd_pragma("vector nontemporal")
+        _use_simd_pragma("novector")
+        for (int n = iifrom; n < iito; n++) {
+	  f_start[n].x += f_start[n + t_off].x;
+	  f_start[n].y += f_start[n + t_off].y;
+	  f_start[n].z += f_start[n + t_off].z;
+	  f_start[n].w += f_start[n + t_off].w;
+	}
+	t_off += f_stride;
+      }
+    } else {
+      for (int t = 1; t < _nthreads; t++) {
+	_use_simd_pragma("vector nontemporal")
+	_use_simd_pragma("novector")
+	for (int n = iifrom; n < iito; n++) {
+	  f_start[n].x += f_start[n + t_off].x;
+	  f_start[n].y += f_start[n + t_off].y;
+	  f_start[n].z += f_start[n + t_off].z;
+	}
+	t_off += f_stride;
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 void FixIntel::sync_coprocessor()
 {
  #ifdef _LMP_INTEL_OFFLOAD
@ -456,6 +570,153 @@ void FixIntel::sync_coprocessor()

 /* ---------------------------------------------------------------------- */

+template <class ft, class acc_t>
+void FixIntel::add_results(const ft * _noalias const f_in,
+                           const acc_t * _noalias const ev_global,
+                           const int eatom, const int vatom,
+			   const int offload) {
+  start_watch(TIME_PACK);
+  int f_length;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_separate_buffers) {
+    if (offload) {
+      add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
+      if (force->newton_pair) {
+	const acc_t * _noalias const enull = 0;
+	int offset = _offload_nlocal;
+	if (atom->torque) offset *= 2;
+	add_oresults(f_in + offset, enull, eatom, vatom,
+		     _offload_min_ghost, _offload_nghost);
+      }
+    } else {
+      add_oresults(f_in, ev_global, eatom, vatom,
+		   _host_min_local, _host_used_local);
+      if (force->newton_pair) {
+	const acc_t * _noalias const enull = 0;
+	int offset = _host_used_local;
+	if (atom->torque) offset *= 2;
+	add_oresults(f_in + offset, enull, eatom,
+		     vatom, _host_min_ghost, _host_used_ghost);
+      }
+    }
+    stop_watch(TIME_PACK);
+    return;
+  }
+  if (force->newton_pair && (_offload_noghost == 0 || offload == 0))
+    f_length = atom->nlocal + atom->nghost;
+  else
+    f_length = atom->nlocal;
+  #else
+  if (force->newton_pair)
+    f_length = atom->nlocal + atom->nghost;
+  else
+    f_length = atom->nlocal;
+  #endif
+
+  add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
+  stop_watch(TIME_PACK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class ft, class acc_t>
+void FixIntel::add_oresults(const ft * _noalias const f_in,
+			    const acc_t * _noalias const ev_global,
+			    const int eatom, const int vatom,
+			    const int out_offset, const int nall) {
+  lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
+  if (atom->torque) {
+    if (f_in[1].w)
+      if (f_in[1].w == 1)
+        error->all(FLERR,"Bad matrix inversion in mldivide3");
+      else
+        error->all(FLERR,
+                   "Sphere particles not yet supported for gayberne/intel");
+  }
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none)
+  #endif
+  {
+    #if defined(_OPENMP)
+    const int tid = omp_get_thread_num();
+    #else
+    const int tid = 0;
+    #endif
+    int ifrom, ito;
+    IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));
+    if (atom->torque) {
+      int ii = ifrom * 2;
+      lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
+	out_offset;
+      if (eatom) {
+	double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
+        #if defined(LMP_SIMD_COMPILER)
+	#pragma novector
+	#endif
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[ii].x;
+          f[i].y += f_in[ii].y;
+          f[i].z += f_in[ii].z;
+          lmp_eatom[i] += f_in[ii].w;
+          tor[i].x += f_in[ii+1].x;
+          tor[i].y += f_in[ii+1].y;
+          tor[i].z += f_in[ii+1].z;
+          ii += 2;
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+	#pragma novector
+	#endif
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[ii].x;
+          f[i].y += f_in[ii].y;
+          f[i].z += f_in[ii].z;
+          tor[i].x += f_in[ii+1].x;
+          tor[i].y += f_in[ii+1].y;
+          tor[i].z += f_in[ii+1].z;
+          ii += 2;
+        }
+      }
+    } else {
+      if (eatom) {
+	double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
+        #if defined(LMP_SIMD_COMPILER)
+	#pragma novector
+	#endif
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[i].x;
+          f[i].y += f_in[i].y;
+          f[i].z += f_in[i].z;
+          lmp_eatom[i] += f_in[i].w;
+        }
+      } else {
+        #if defined(LMP_SIMD_COMPILER)
+	#pragma novector
+	#endif
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[i].x;
+          f[i].y += f_in[i].y;
+          f[i].z += f_in[i].z;
+        }
+      }
+    }
+  }
+
+  if (ev_global != NULL) {
+    force->pair->eng_vdwl += ev_global[0];
+    force->pair->eng_coul += ev_global[1];
+    force->pair->virial[0] += ev_global[2];
+    force->pair->virial[1] += ev_global[3];
+    force->pair->virial[2] += ev_global[4];
+    force->pair->virial[3] += ev_global[5];
+    force->pair->virial[4] += ev_global[6];
+    force->pair->virial[5] += ev_global[7];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 double FixIntel::memory_usage()
 {
  double bytes;
@ -473,6 +734,70 @@ double FixIntel::memory_usage()

 #ifdef _LMP_INTEL_OFFLOAD

+/* ---------------------------------------------------------------------- */
+
+template <class ft, class acc_t>
+void FixIntel::add_off_results(const ft * _noalias const f_in,
+                               const acc_t * _noalias const ev_global) {
+  if (_offload_balance < 0.0)
+    _balance_other_time = MPI_Wtime() - _balance_other_time;
+
+  start_watch(TIME_OFFLOAD_WAIT);
+  #ifdef _LMP_INTEL_OFFLOAD
+  #pragma offload_wait target(mic:_cop) wait(f_in)
+  #endif
+  double wait_time = stop_watch(TIME_OFFLOAD_WAIT);
+
+  if (neighbor->ago == 0) {
+    if (_off_overflow_flag[LMP_OVERFLOW])
+      error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+    _offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1;
+    _offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN];
+    _offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 -
+      _offload_min_ghost;
+    if (_offload_nghost < 0) _offload_nghost = 0;
+    _offload_nall = _offload_nlocal + _offload_nghost;
+      _offload_nlocal;
+  }
+
+  int nlocal = atom->nlocal;
+  // Load balance?
+  if (_offload_balance < 0.0) {
+    if (neighbor->ago == 0)
+      _balance_pair = _balance_neighbor;
+    double mic_time;
+    mic_time = *_stopwatch_offload_pair;
+    if (_balance_pair_time + _balance_other_time < mic_time) {
+      double ft = _balance_pair_time + _balance_other_time + wait_time -
+          mic_time;
+      _balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed +
+          INTEL_LB_MEAN_WEIGHT * ft;
+    }
+
+    double ctps = _balance_pair_time / (1.0-_balance_pair);
+    double otps = mic_time / _balance_pair;
+    double new_balance = (ctps + _balance_other_time - _balance_fixed) /
+        (otps + ctps);
+    if (new_balance < 0.01) new_balance = 0.01;
+    else if (new_balance > 0.99) new_balance = 0.99;
+    _balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor +
+        INTEL_LB_MEAN_WEIGHT * new_balance;
+  }
+
+  #ifdef TIME_BALANCE
+  start_watch(TIME_IMBALANCE);
+  MPI_Barrier(_real_space_comm);
+  stop_watch(TIME_IMBALANCE);
+  #endif
+  acc_timers();
+  if (atom->torque)
+    if (f_in[1].w < 0.0)
+      error->all(FLERR, "Bad matrix inversion in mldivide3");
+  add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
+}
+
+/* ---------------------------------------------------------------------- */
+
 void FixIntel::output_timing_data() {
  if (_im_real_space_task == 0 || _offload_affinity_set == 0) return;

--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@ -44,7 +44,9 @@ class FixIntel : public Fix {
  virtual void init();
  virtual void setup(int);
  void pair_init_check(const bool cdmessage=false);
+  void bond_init_check();

+  void sync();
  // Get all forces, calculation results from coprocesser
  void sync_coprocessor();

@ -65,6 +67,12 @@ class FixIntel : public Fix {
  inline int nbor_pack_width() const { return _nbor_pack_width; }
  inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }

+  inline int need_zero(const int tid) {
+    if (_need_reduce == 0 && tid > 0) return 1;
+    return 0;
+  }
+  inline void set_reduce_flag() { _need_reduce = 1; }
+
 protected:
  IntelBuffers<float,float> *_single_buffers;
  IntelBuffers<float,double> *_mixed_buffers;
@ -77,13 +85,16 @@ class FixIntel : public Fix {
  inline int* get_off_overflow_flag() { return _off_overflow_flag; }
  inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
                               double *ev_in, const int offload,
-                               const int eatom = 0, const int vatom = 0);
+                               const int eatom = 0, const int vatom = 0,
+			       const int rflag = 0);
  inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
                               double *ev_in, const int offload,
-                               const int eatom = 0, const int vatom = 0);
+                               const int eatom = 0, const int vatom = 0,
+			       const int rflag = 0);
  inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
                               float *ev_in, const int offload,
-                               const int eatom = 0, const int vatom = 0);
+                               const int eatom = 0, const int vatom = 0,
+			       const int rflag = 0);
  inline void get_buffern(const int offload, int &nlocal, int &nall,
 			  int &minlocal);

@ -139,6 +150,15 @@ class FixIntel : public Fix {
  int _overflow_flag[5];
  _alignvar(int _off_overflow_flag[5],64);
  int _allow_separate_buffers, _offload_ghost;
+
+  IntelBuffers<float,float>::vec3_acc_t *_force_array_s;
+  IntelBuffers<float,double>::vec3_acc_t *_force_array_m;
+  IntelBuffers<double,double>::vec3_acc_t *_force_array_d;
+  float *_ev_array_s;
+  double *_ev_array_d;
+  int _results_eatom, _results_vatom;
+  int _need_reduce;
+
  #ifdef _LMP_INTEL_OFFLOAD
  double _balance_pair_time, _balance_other_time;
  int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
@ -168,6 +188,9 @@ class FixIntel : public Fix {
  _alignvar(double _stopwatch_offload_neighbor[1],64);
  _alignvar(double _stopwatch_offload_pair[1],64);

+  template <class ft>
+  void reduce_results(ft * _noalias const f_in);
+
  template <class ft, class acc_t>
  inline void add_results(const ft * _noalias const f_in,
                          const acc_t * _noalias const ev_global,
@ -226,7 +249,8 @@ void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,

 void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
                                double *ev_in, const int offload,
-                                const int eatom, const int vatom) {
+                                const int eatom, const int vatom,
+				const int rflag) {
  #ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    _off_results_eatom = eatom;
@ -237,11 +261,23 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
    return;
  }
  #endif
-  add_results(f_in, ev_in, eatom, vatom, 0);
+
+  _force_array_d = f_in;
+  _ev_array_d = ev_in;
+  _results_eatom = eatom;
+  _results_vatom = vatom;
+  #ifndef _LMP_INTEL_OFFLOAD
+  if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
+  #endif
+
  if (_overflow_flag[LMP_OVERFLOW])
    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+
  #ifdef _LMP_INTEL_OFFLOAD
-  if (_sync_at_pair) sync_coprocessor();
+  if (_sync_at_pair) {
+    sync();
+    sync_coprocessor();
+  }
  #endif
 }

@ -249,7 +285,8 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,

 void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
                                double *ev_in, const int offload,
-                                const int eatom, const int vatom) {
+                                const int eatom, const int vatom,
+				const int rflag) {
  #ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    _off_results_eatom = eatom;
@ -260,11 +297,23 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
    return;
  }
  #endif
-  add_results(f_in, ev_in, eatom, vatom, 0);
+
+  _force_array_m = f_in;
+  _ev_array_d = ev_in;
+  _results_eatom = eatom;
+  _results_vatom = vatom;
+  #ifndef _LMP_INTEL_OFFLOAD
+  if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
+  #endif
+
  if (_overflow_flag[LMP_OVERFLOW])
    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+
  #ifdef _LMP_INTEL_OFFLOAD
-  if (_sync_at_pair) sync_coprocessor();
+  if (_sync_at_pair) {
+    sync();
+    sync_coprocessor();
+  }
  #endif
 }

@ -272,7 +321,8 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,

 void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
                                float *ev_in, const int offload,
-                                const int eatom, const int vatom) {
+                                const int eatom, const int vatom,
+                                const int rflag) {
  #ifdef _LMP_INTEL_OFFLOAD
  if (offload) {
    _off_results_eatom = eatom;
@ -283,161 +333,28 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
    return;
  }
  #endif
-  add_results(f_in, ev_in, eatom, vatom, 0);
+
+  _force_array_s = f_in;
+  _ev_array_s = ev_in;
+  _results_eatom = eatom;
+  _results_vatom = vatom;
+  #ifndef _LMP_INTEL_OFFLOAD
+  if (rflag != 2 && _nthreads > 1) _need_reduce = 1;
+  #endif
+
  if (_overflow_flag[LMP_OVERFLOW])
    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+
  #ifdef _LMP_INTEL_OFFLOAD
-  if (_sync_at_pair) sync_coprocessor();
+  if (_sync_at_pair) {
+    sync();
+    sync_coprocessor();
+  }
  #endif
 }

 /* ---------------------------------------------------------------------- */

-template <class ft, class acc_t>
-void FixIntel::add_results(const ft * _noalias const f_in,
-                           const acc_t * _noalias const ev_global,
-                           const int eatom, const int vatom,
-			   const int offload) {
-  start_watch(TIME_PACK);
-  int f_length;
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (_separate_buffers) {
-    if (offload) {
-      add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
-      if (force->newton_pair) {
-	const acc_t * _noalias const enull = 0;
-	int offset = _offload_nlocal;
-	if (atom->torque) offset *= 2;
-	add_oresults(f_in + offset, enull, eatom, vatom,
-		     _offload_min_ghost, _offload_nghost);
-      }
-    } else {
-      add_oresults(f_in, ev_global, eatom, vatom,
-		   _host_min_local, _host_used_local);
-      if (force->newton_pair) {
-	const acc_t * _noalias const enull = 0;
-	int offset = _host_used_local;
-	if (atom->torque) offset *= 2;
-	add_oresults(f_in + offset, enull, eatom,
-		     vatom, _host_min_ghost, _host_used_ghost);
-      }
-    }
-    stop_watch(TIME_PACK);
-    return;
-  }
-  if (force->newton_pair && (_offload_noghost == 0 || offload == 0))
-    f_length = atom->nlocal + atom->nghost;
-  else
-    f_length = atom->nlocal;
-  #else
-  if (force->newton_pair)
-    f_length = atom->nlocal + atom->nghost;
-  else
-    f_length = atom->nlocal;
-  #endif
-
-  add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
-  stop_watch(TIME_PACK);
-}
-
-/* ---------------------------------------------------------------------- */
-
-template <class ft, class acc_t>
-void FixIntel::add_oresults(const ft * _noalias const f_in,
-			    const acc_t * _noalias const ev_global,
-			    const int eatom, const int vatom,
-			    const int out_offset, const int nall) {
-  lmp_ft * _noalias const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
-  if (atom->torque) {
-    if (f_in[1].w)
-      if (f_in[1].w == 1)
-        error->all(FLERR,"Bad matrix inversion in mldivide3");
-      else
-        error->all(FLERR,
-                   "Sphere particles not yet supported for gayberne/intel");
-  }
-
-  #if defined(_OPENMP)
-  #pragma omp parallel default(none)
-  #endif
-  {
-    #if defined(_OPENMP)
-    const int tid = omp_get_thread_num();
-    #else
-    const int tid = 0;
-    #endif
-    int ifrom, ito;
-    IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));
-    if (atom->torque) {
-      int ii = ifrom * 2;
-      lmp_ft * _noalias const tor = (lmp_ft *) lmp->atom->torque[0] +
-	out_offset;
-      if (eatom) {
-	double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
-        #if defined(LMP_SIMD_COMPILER)
-	#pragma novector
-	#endif
-        for (int i = ifrom; i < ito; i++) {
-          f[i].x += f_in[ii].x;
-          f[i].y += f_in[ii].y;
-          f[i].z += f_in[ii].z;
-          lmp_eatom[i] += f_in[ii].w;
-          tor[i].x += f_in[ii+1].x;
-          tor[i].y += f_in[ii+1].y;
-          tor[i].z += f_in[ii+1].z;
-          ii += 2;
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-	#pragma novector
-	#endif
-        for (int i = ifrom; i < ito; i++) {
-          f[i].x += f_in[ii].x;
-          f[i].y += f_in[ii].y;
-          f[i].z += f_in[ii].z;
-          tor[i].x += f_in[ii+1].x;
-          tor[i].y += f_in[ii+1].y;
-          tor[i].z += f_in[ii+1].z;
-          ii += 2;
-        }
-      }
-    } else {
-      if (eatom) {
-	double * _noalias const lmp_eatom = force->pair->eatom + out_offset;
-        #if defined(LMP_SIMD_COMPILER)
-	#pragma novector
-	#endif
-        for (int i = ifrom; i < ito; i++) {
-          f[i].x += f_in[i].x;
-          f[i].y += f_in[i].y;
-          f[i].z += f_in[i].z;
-          lmp_eatom[i] += f_in[i].w;
-        }
-      } else {
-        #if defined(LMP_SIMD_COMPILER)
-	#pragma novector
-	#endif
-        for (int i = ifrom; i < ito; i++) {
-          f[i].x += f_in[i].x;
-          f[i].y += f_in[i].y;
-          f[i].z += f_in[i].z;
-        }
-      }
-    }
-  }
-
-  if (ev_global != NULL) {
-    force->pair->eng_vdwl += ev_global[0];
-    force->pair->eng_coul += ev_global[1];
-    force->pair->virial[0] += ev_global[2];
-    force->pair->virial[1] += ev_global[3];
-    force->pair->virial[2] += ev_global[4];
-    force->pair->virial[3] += ev_global[5];
-    force->pair->virial[4] += ev_global[6];
-    force->pair->virial[5] += ev_global[7];
-  }
-}
-
 #ifdef _LMP_INTEL_OFFLOAD

 /* ---------------------------------------------------------------------- */
@ -491,70 +408,6 @@ void FixIntel::set_neighbor_host_sizes() {

 /* ---------------------------------------------------------------------- */

-template <class ft, class acc_t>
-void FixIntel::add_off_results(const ft * _noalias const f_in,
-                               const acc_t * _noalias const ev_global) {
-  if (_offload_balance < 0.0)
-    _balance_other_time = MPI_Wtime() - _balance_other_time;
-
-  start_watch(TIME_OFFLOAD_WAIT);
-  #ifdef _LMP_INTEL_OFFLOAD
-  if (neighbor->ago == 0) {
-    #pragma offload_wait target(mic:_cop) wait(atom->tag, f_in)
-  } else {
-    #pragma offload_wait target(mic:_cop) wait(f_in)
-  }
-  #endif
-  double wait_time = stop_watch(TIME_OFFLOAD_WAIT);
-
-  if (neighbor->ago == 0) {
-    if (_off_overflow_flag[LMP_OVERFLOW])
-      error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
-    _offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1;
-    _offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN];
-    _offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 -
-      _offload_min_ghost;
-    if (_offload_nghost < 0) _offload_nghost = 0;
-    _offload_nall = _offload_nlocal + _offload_nghost;
-      _offload_nlocal;
-  }
-
-  int nlocal = atom->nlocal;
-  // Load balance?
-  if (_offload_balance < 0.0) {
-    if (neighbor->ago == 0)
-      _balance_pair = _balance_neighbor;
-    double mic_time;
-    mic_time = *_stopwatch_offload_pair;
-    if (_balance_pair_time + _balance_other_time < mic_time) {
-      double ft = _balance_pair_time + _balance_other_time + wait_time -
-          mic_time;
-      _balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed +
-          INTEL_LB_MEAN_WEIGHT * ft;
-    }
-
-    double ctps = _balance_pair_time / (1.0-_balance_pair);
-    double otps = mic_time / _balance_pair;
-    double new_balance = (ctps + _balance_other_time - _balance_fixed) /
-        (otps + ctps);
-    if (new_balance < 0.01) new_balance = 0.01;
-    else if (new_balance > 0.99) new_balance = 0.99;
-    _balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor +
-        INTEL_LB_MEAN_WEIGHT * new_balance;
-  }
-
-  #ifdef TIME_BALANCE
-  start_watch(TIME_IMBALANCE);
-  MPI_Barrier(_real_space_comm);
-  stop_watch(TIME_IMBALANCE);
-  #endif
-  acc_timers();
-  if (atom->torque)
-    if (f_in[1].w < 0.0)
-      error->all(FLERR, "Bad matrix inversion in mldivide3");
-  add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
-}
-
 #endif

 }
@ -653,4 +506,13 @@ W: More MPI tasks/OpenMP threads than available cores
 Using more MPI tasks/OpenMP threads than available cores will typically
 decrease performance.

+E: USER-INTEL package requires same setting for newton bond and non-bond.
+
+The newton setting must be the same for both pairwise and bonded forces.
+
+E: Intel styles for bond/angle/dihedral/improper require intel pair style."
+
+You cannot use the USER-INTEL package for bond calculations without a 
+USER-INTEL supported pair style.
+
 */
--- a/src/USER-INTEL/improper_harmonic_intel.cpp
+++ b/src/USER-INTEL/improper_harmonic_intel.cpp
@ -0,0 +1,378 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <math.h>
+#include <stdlib.h>
+#include "improper_harmonic_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "update.h"
+#include "math_const.h"
+#include "memory.h"
+#include "modify.h"
+#include "suffix.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+
+#define PTOLERANCE (flt_t)1.05
+#define MTOLERANCE (flt_t)-1.05
+#define SMALL     (flt_t)0.001
+typedef struct { int a,b,c,d,t;  } int5_t;
+
+/* ---------------------------------------------------------------------- */
+
+ImproperHarmonicIntel::ImproperHarmonicIntel(LAMMPS *lmp) : 
+  ImproperHarmonic(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+ImproperHarmonicIntel::~ImproperHarmonicIntel()
+{
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ImproperHarmonicIntel::compute(int eflag, int vflag)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_use_base) {
+    ImproperHarmonic::compute(eflag, vflag);
+    return;
+  }
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void ImproperHarmonicIntel::compute(int eflag, int vflag,
+				    IntelBuffers<flt_t,acc_t> *buffers,
+				    const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = 0;
+
+  if (evflag) {
+    if (eflag) {
+      if (force->newton_bond)
+	eval<1,1,1>(vflag, buffers, fc);
+      else
+	eval<1,1,0>(vflag, buffers, fc);
+    } else {
+      if (force->newton_bond)
+	eval<1,0,1>(vflag, buffers, fc);
+      else
+	eval<1,0,0>(vflag, buffers, fc);
+    }
+  } else {
+    if (force->newton_bond)
+      eval<0,0,1>(vflag, buffers, fc);
+    else
+      eval<0,0,0>(vflag, buffers, fc);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+void ImproperHarmonicIntel::eval(const int vflag, 
+				 IntelBuffers<flt_t,acc_t> *buffers,
+				 const ForceConst<flt_t> &fc)
+{
+  const int inum = neighbor->nimproperlist;
+  if (inum == 0) return;
+
+  ATOM_T * _noalias const x = buffers->get_x(0);
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  int f_stride;
+  if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
+  else f_stride = buffers->get_stride(nlocal);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+
+  acc_t oeimproper, ov0, ov1, ov2, ov3, ov4, ov5;
+  if (EVFLAG) {
+    if (EFLAG)
+      oeimproper = (acc_t)0.0;
+    if (vflag) {
+      ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+    }
+  }
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(f_start,f_stride,fc) \
+    reduction(+:oeimproper,ov0,ov1,ov2,ov3,ov4,ov5)
+  #endif
+  {
+    int nfrom, nto, tid;
+    IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+
+    FORCE_T * _noalias const f = f_start + (tid * f_stride);
+    if (fix->need_zero(tid))
+      memset(f, 0, f_stride * sizeof(FORCE_T));
+
+    const int5_t * _noalias const improperlist = 
+      (int5_t *) neighbor->improperlist[0];
+
+    for (int n = nfrom; n < nto; n++) {
+      const int i1 = improperlist[n].a;
+      const int i2 = improperlist[n].b;
+      const int i3 = improperlist[n].c;
+      const int i4 = improperlist[n].d;
+      const int type = improperlist[n].t;
+
+      // geometry of 4-body
+
+      const flt_t vb1x = x[i1].x - x[i2].x;
+      const flt_t vb1y = x[i1].y - x[i2].y;
+      const flt_t vb1z = x[i1].z - x[i2].z;
+
+      const flt_t vb2x = x[i3].x - x[i2].x;
+      const flt_t vb2y = x[i3].y - x[i2].y;
+      const flt_t vb2z = x[i3].z - x[i2].z;
+
+      const flt_t vb3x = x[i4].x - x[i3].x;
+      const flt_t vb3y = x[i4].y - x[i3].y;
+      const flt_t vb3z = x[i4].z - x[i3].z;
+
+      const flt_t ss1 = (flt_t)1.0 / (vb1x*vb1x + vb1y*vb1y + vb1z*vb1z);
+      const flt_t ss2 = (flt_t)1.0 / (vb2x*vb2x + vb2y*vb2y + vb2z*vb2z);
+      const flt_t ss3 = (flt_t)1.0 / (vb3x*vb3x + vb3y*vb3y + vb3z*vb3z);
+
+      const flt_t r1 = sqrt(ss1);
+      const flt_t r2 = sqrt(ss2);
+      const flt_t r3 = sqrt(ss3);
+
+      // sin and cos of angle
+
+      const flt_t c0 = (vb1x * vb3x + vb1y * vb3y + vb1z * vb3z) * r1 * r3;
+      const flt_t c1 = (vb1x * vb2x + vb1y * vb2y + vb1z * vb2z) * r1 * r2;
+      const flt_t c2 = -(vb3x * vb2x + vb3y * vb2y + vb3z * vb2z) * r3 * r2;
+
+      flt_t s1 = 1.0 - c1*c1;
+      if (s1 < SMALL) s1 = SMALL;
+      s1 = (flt_t)1.0 / s1;
+
+      flt_t s2 = (flt_t)1.0 - c2*c2;
+      if (s2 < SMALL) s2 = SMALL;
+      s2 = (flt_t)1.0 / s2;
+
+      flt_t s12 = sqrt(s1*s2);
+      flt_t c = (c1*c2 + c0) * s12;
+
+      // error check
+
+      if (c > PTOLERANCE || c < MTOLERANCE) {
+        int me;
+	MPI_Comm_rank(world,&me);
+	if (screen) {
+          char str[128];
+	  sprintf(str,"Improper problem: %d " BIGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT,
+                  me,update->ntimestep,
+                  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+	  error->warning(FLERR,str,0);
+	  fprintf(screen,"  1st atom: %d %g %g %g\n",
+                  me,x[i1].x,x[i1].y,x[i1].z);
+	  fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                  me,x[i2].x,x[i2].y,x[i2].z);
+	  fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                  me,x[i3].x,x[i3].y,x[i3].z);
+	  fprintf(screen,"  4th atom: %d %g %g %g\n",
+                  me,x[i4].x,x[i4].y,x[i4].z);
+        }
+      }
+
+      if (c > (flt_t)1.0) c = (flt_t)1.0;
+      if (c < (flt_t)-1.0) c = (flt_t)-1.0;
+
+      flt_t s = sqrt((flt_t)1.0 - c*c);
+      if (s < SMALL) s = SMALL;
+
+      // force & energy
+
+      const flt_t domega = acos(c) - fc.fc[type].chi;
+      flt_t a;
+      a = fc.fc[type].k * domega;
+
+      flt_t eimproper;
+      if (EFLAG) eimproper = a*domega;
+
+      a = -a * (flt_t)2.0/s;
+      c = c * a;
+      s12 = s12 * a;
+      const flt_t a11 = c*ss1*s1;
+      const flt_t a22 = -ss2 * ((flt_t)2.0*c0*s12 - c*(s1+s2));
+      const flt_t a33 = c*ss3*s2;
+      const flt_t a12 = -r1*r2*(c1*c*s1 + c2*s12);
+      const flt_t a13 = -r1*r3*s12;
+      const flt_t a23 = r2*r3*(c2*c*s2 + c1*s12);
+
+      const flt_t sx2  = a22*vb2x + a23*vb3x + a12*vb1x;
+      const flt_t sy2  = a22*vb2y + a23*vb3y + a12*vb1y;
+      const flt_t sz2  = a22*vb2z + a23*vb3z + a12*vb1z;
+
+      const flt_t f1x = a12*vb2x + a13*vb3x + a11*vb1x;
+      const flt_t f1y = a12*vb2y + a13*vb3y + a11*vb1y;
+      const flt_t f1z = a12*vb2z + a13*vb3z + a11*vb1z;
+
+      const flt_t f2x = -sx2 - f1x;
+      const flt_t f2y = -sy2 - f1y;
+      const flt_t f2z = -sz2 - f1z;
+
+      const flt_t f4x = a23*vb2x + a33*vb3x + a13*vb1x;
+      const flt_t f4y = a23*vb2y + a33*vb3y + a13*vb1y;
+      const flt_t f4z = a23*vb2z + a33*vb3z + a13*vb1z;
+
+      const flt_t f3x = sx2 - f4x;
+      const flt_t f3y = sy2 - f4y;
+      const flt_t f3z = sz2 - f4z;
+
+      // apply force to each of 4 atoms
+
+      if (NEWTON_BOND || i1 < nlocal) {
+        f[i1].x += f1x;
+	f[i1].y += f1y;
+	f[i1].z += f1z;
+      }
+
+      if (NEWTON_BOND || i2 < nlocal) {
+        f[i2].x += f2x;
+	f[i2].y += f2y;
+	f[i2].z += f2z;
+      }
+
+      if (NEWTON_BOND || i3 < nlocal) {
+        f[i3].x += f3x;
+        f[i3].y += f3y;
+        f[i3].z += f3z;
+      }
+
+      if (NEWTON_BOND || i4 < nlocal) {
+        f[i4].x += f4x;
+	f[i4].y += f4y;
+	f[i4].z += f4z;
+      }
+
+      if (EVFLAG) {
+	IP_PRE_ev_tally_dihed(EFLAG, eatom, vflag, eimproper, i1, i2, i3, i4, 
+                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z, 
+                              vb1x, vb1y, vb1z, vb2x, vb2y, vb2z, vb3x, vb3y, 
+                              vb3z, oeimproper, f, NEWTON_BOND, nlocal,
+			      ov0, ov1, ov2, ov3, ov4, ov5);
+      }
+    } // for n
+  } // omp parallel
+  if (EVFLAG) {
+    if (EFLAG)
+      energy += oeimproper;
+    if (vflag) {
+      virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+      virial[3] += ov3; virial[4] += ov4; virial[5] += ov5; 
+    }
+  }
+
+  fix->set_reduce_flag();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ImproperHarmonicIntel::init()
+{
+  ImproperHarmonic::init();
+
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _use_base = 0;
+  if (fix->offload_balance() != 0.0) {
+    _use_base = 1;
+    return;
+  }
+  #endif
+
+  fix->bond_init_check();
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void ImproperHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
+					     IntelBuffers<flt_t,acc_t> *buffers)
+{
+  const int bp1 = atom->nimpropertypes + 1;
+  fc.set_ntypes(bp1,memory);
+
+  for (int i = 0; i < bp1; i++) {
+    fc.fc[i].k = k[i];
+    fc.fc[i].chi = chi[i];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void ImproperHarmonicIntel::ForceConst<flt_t>::set_ntypes(const int nimproper,
+	                                                  Memory *memory) {
+  if (nimproper != _nimpropertypes) {
+    if (_nimpropertypes > 0)
+      _memory->destroy(fc);
+    
+    if (nimproper > 0)
+      _memory->create(fc,nimproper,"improperharmonicintel.fc");
+  }
+  _nimpropertypes = nimproper;
+  _memory = memory;
+}
--- a/src/USER-INTEL/improper_harmonic_intel.h
+++ b/src/USER-INTEL/improper_harmonic_intel.h
@ -0,0 +1,94 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef IMPROPER_CLASS
+
+ImproperStyle(harmonic/intel,ImproperHarmonicIntel)
+
+#else
+
+#ifndef LMP_IMPROPER_HARMONIC_INTEL_H
+#define LMP_IMPROPER_HARMONIC_INTEL_H
+
+#include <stdio.h>
+#include "improper_harmonic.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class ImproperHarmonicIntel : public ImproperHarmonic {
+ public:
+  ImproperHarmonicIntel(class LAMMPS *);
+  virtual ~ImproperHarmonicIntel();
+  virtual void compute(int, int);
+  virtual void init();
+
+ protected:
+  FixIntel *fix;
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers, 
+	    const ForceConst<flt_t> &fc);
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _use_base;
+  #endif
+
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t k, chi; } fc_packed1;
+
+    fc_packed1 *fc;
+
+    ForceConst() : _nimpropertypes(0)  {}
+    ~ForceConst() { set_ntypes(0, NULL); }
+
+    void set_ntypes(const int nimpropertypes, Memory *memory);
+
+   private:
+    int _nimpropertypes;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+W: Improper problem: %d %ld %d %d %d %d
+
+Conformation of the 4 listed improper atoms is extreme; you may want
+to check your simulation geometry.
+
+E: Incorrect args for improper coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+*/
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@ -319,13 +319,10 @@ template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::free_nbor_list()
 {
  if (_list_alloc_atoms > 0) {
-    lmp->memory->destroy(_list_alloc);
-    _list_alloc_atoms = 0;
-
    #ifdef _LMP_INTEL_OFFLOAD
    if (_off_list_alloc) {
      int * list_alloc = _list_alloc;
-      int * special_flag = lmp->neighbor->special_flag_alloc();
+      int * special_flag = _off_map_special_flag;
      int * stencil = _off_map_stencil;
      if (list_alloc != 0 && special_flag != 0 && stencil != 0) {
        #pragma offload_transfer target(mic:_cop) \
@ -335,6 +332,8 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list()
      _off_list_alloc = false;
    }
    #endif
+    lmp->memory->destroy(_list_alloc);
+    _list_alloc_atoms = 0;
  }
 }

@ -350,7 +349,7 @@ void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
  free_nbor_list();
  _list_alloc_atoms = 1.10 * nlocal;
  int nt = MAX(nthreads, _off_threads);
-  int list_alloc_size = (_list_alloc_atoms + nt + pack_width - 1) * 
+  int list_alloc_size = (_list_alloc_atoms + nt * 2 + pack_width - 1) * 
    get_max_nbors();
  lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
  #ifdef _LMP_INTEL_OFFLOAD
@ -365,6 +364,7 @@ void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
        in(stencil:length(list->maxstencil) alloc_if(1) free_if(0)) \
        nocopy(list_alloc:length(list_alloc_size) alloc_if(1) free_if(0))
      _off_map_stencil = stencil;
+      _off_map_special_flag = special_flag;
      _off_list_alloc = true;
    }
  }
@ -406,6 +406,11 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
      #pragma offload_transfer target(mic:_cop) \
        nocopy(ccachex,ccachey,ccachez,ccachew:alloc_if(0) free_if(1)) \
        nocopy(ccachei,ccachej:alloc_if(0) free_if(1))
+
+      #ifdef LMP_USE_AVXCD
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ccachef:alloc_if(0) free_if(1))
+      #endif
    }
    _off_ccache = 0;
    #endif
@ -451,7 +456,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
  lmp->memory->create(_ccachei, vsize, "_ccachei");
  lmp->memory->create(_ccachej, vsize, "_ccachej");
  #ifdef LMP_USE_AVXCD
-  IP_PRE_get_stride(_ccache_stride3, nsize * 3, esize, 0);
+  IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
  lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
  #endif
  memset(_ccachej, 0, vsize * sizeof(int));
@ -473,6 +478,12 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
        nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
        in(ccachej:length(vsize) alloc_if(1) free_if(0))
    }
+    #ifdef LMP_USE_AVXCD
+    if (ccachef != NULL) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ccachef:length(_ccache_stride3 * nt) alloc_if(1) free_if(0))
+    }
+    #endif
    _off_ccache = 1;
  }
  #endif
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@ -298,7 +298,7 @@ class IntelBuffers {
  quat_t *_host_quat;
  vec3_acc_t *_off_f;
  int _off_map_nmax, _off_map_maxhead, _cop, _off_ccache;
-  int *_off_map_ilist;
+  int *_off_map_ilist, *_off_map_special_flag;
  int *_off_map_stencil, *_off_map_special, *_off_map_nspecial, *_off_map_tag;
  int *_off_map_binhead, *_off_map_bins, *_off_map_numneigh;
  bool _off_list_alloc;
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@ -383,11 +383,78 @@ inline double MIC_Wtime() {
  }                                                                     \
 }

+#define IP_PRE_ev_tally_bond(eflag, eatom, vflag, ebond, i1, i2, fbond,	\
+			     delx, dely, delz, obond, force, newton,	\
+			     nlocal, ov0, ov1, ov2, ov3, ov4, ov5)	\
+{                                                                       \
+  flt_t ev_pre;								\
+  if (newton) ev_pre = (flt_t)1.0;					\
+  else {								\
+    ev_pre = (flt_t)0.0;						\
+    if (i1 < nlocal) ev_pre += (flt_t)0.5;				\
+    if (i2 < nlocal) ev_pre += (flt_t)0.5;				\
+  }									\
+									\
+  if (eflag) {								\
+    oebond += ev_pre * ebond;						\
+    if (eatom) {							\
+      flt_t halfeng = ebond * (flt_t)0.5;				\
+      if (newton || i1 < nlocal) f[i1].w += halfeng;			\
+      if (newton || i2 < nlocal) f[i2].w += halfeng;			\
+    }									\
+  }									\
+									\
+  if (vflag) {								\
+    ov0 += ev_pre * (delx * delx * fbond);				\
+    ov1 += ev_pre * (dely * dely * fbond);				\
+    ov2 += ev_pre * (delz * delz * fbond);				\
+    ov3 += ev_pre * (delx * dely * fbond);				\
+    ov4 += ev_pre * (delx * delz * fbond);				\
+    ov5 += ev_pre * (dely * delz * fbond);				\
+  }                                                                     \
+}
+
+#define IP_PRE_ev_tally_angle(eflag, eatom, vflag, eangle, i1, i2, i3, 	\
+			      f1x, f1y, f1z, f3x, f3y, f3z, delx1,	\
+			      dely1, delz1, delx2, dely2, delz2,	\
+			      oeangle, force, newton, nlocal, ov0, ov1, \
+			      ov2, ov3, ov4, ov5)			\
+{                                                                       \
+  flt_t ev_pre;								\
+  if (newton) ev_pre = (flt_t)1.0;					\
+  else {								\
+    ev_pre = (flt_t)0.0;						\
+    if (i1 < nlocal) ev_pre += (flt_t)0.3333333333333333;		\
+    if (i2 < nlocal) ev_pre += (flt_t)0.3333333333333333;		\
+    if (i3 < nlocal) ev_pre += (flt_t)0.3333333333333333;		\
+  }									\
+									\
+  if (eflag) {								\
+    oeangle += ev_pre * eangle;						\
+    if (eatom) {							\
+      flt_t thirdeng = eangle * (flt_t)0.3333333333333333;		\
+      if (newton || i1 < nlocal) f[i1].w += thirdeng;			\
+      if (newton || i2 < nlocal) f[i2].w += thirdeng;			\
+      if (newton || i3 < nlocal) f[i3].w += thirdeng;			\
+    }									\
+  }									\
+									\
+  if (vflag) {								\
+    ov0 += ev_pre * (delx1 * f1x + delx2 * f3x);			\
+    ov1 += ev_pre * (dely1 * f1y + dely2 * f3y);			\
+    ov2 += ev_pre * (delz1 * f1z + delz2 * f3z);			\
+    ov3 += ev_pre * (delx1 * f1y + delx2 * f3y);			\
+    ov4 += ev_pre * (delx1 * f1z + delx2 * f3z);			\
+    ov5 += ev_pre * (dely1 * f1z + dely2 * f3z);			\
+  }                                                                     \
+}
+
 #define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\
 			      f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,	\
 			      f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,	\
 			      vb3x, vb3y, vb3z,oedihedral, force,	\
-			      newton, nlocal)				\
+			      newton, nlocal, ov0, ov1, ov2, ov3, ov4,  \
+			      ov5)					\
 {                                                                       \
  flt_t ev_pre;								\
  if (newton) ev_pre = (flt_t)1.0;					\
@ -411,12 +478,12 @@ inline double MIC_Wtime() {
  }									\
 									\
  if (vflag) {								\
-    sv0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x);		\
-    sv1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y);		\
-    sv2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z);		\
-    sv3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y);		\
-    sv4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z);		\
-    sv5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z);		\
+    ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x);		\
+    ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y);		\
+    ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z);		\
+    ov3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y);		\
+    ov4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z);		\
+    ov5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z);		\
  }                                                                     \
 }

@ -459,7 +526,7 @@ inline double MIC_Wtime() {

 #define IP_PRE_fdotr_acc_force(newton, evflag, eflag, vflag, eatom,	\
 			       nall, nlocal, minlocal, nthreads,	\
-			       f_start, f_stride, x)			\
+			       f_start, f_stride, x, offload)		\
 {									\
  int o_range;								\
  if (newton)								\
--- a/src/USER-INTEL/intel_simd.h
+++ b/src/USER-INTEL/intel_simd.h
@ -110,6 +110,16 @@ namespace ip_simd {
    return _mm512_maskz_mov_pd(m, one);
  }

+  inline SIMD_float SIMD_set(const SIMD_float &src, const SIMD_mask &m, 
+			     const SIMD_float &one) {
+    return _mm512_mask_mov_ps(src,m,one);
+  }
+
+  inline SIMD_double SIMD_set(const SIMD_double &src, const SIMD_mask &m, 
+			      const SIMD_double &one) {
+    return _mm512_mask_mov_pd(src,m,one);
+  }
+
  // -------- Load Operations

  inline SIMD_int SIMD_load(const int *p) {
@ -352,6 +362,14 @@ namespace ip_simd {
    return _mm512_rcp28_pd(one);
  }

+  inline SIMD_float SIMD_rcpz(const SIMD_mask &m, const SIMD_float &one) {
+    return _mm512_maskz_rcp28_ps(m, one);
+  }
+
+  inline SIMD_double SIMD_rcpz(const SIMD_mask &m, const SIMD_double &one) {
+    return _mm512_maskz_rcp28_pd(m, one);
+  }
+
  inline SIMD_float SIMD_sqrt(const SIMD_float &one) {
    return _mm512_sqrt_ps(one);
  }
@ -456,6 +474,40 @@ namespace ip_simd {
    return _mm512_cmple_pd_mask(SIMD_set(one), two);
  }

+  inline SIMD_mask operator>(const SIMD_int &one, const SIMD_int &two) {
+    return _mm512_cmpgt_epi32_mask(one,two);
+  }
+
+  inline SIMD_mask operator>(const SIMD_float &one, const SIMD_float &two) {
+    return _mm512_cmplt_ps_mask(two,one);
+  }
+
+  inline SIMD_mask operator>(const SIMD_double &one, const SIMD_double &two) {
+    return _mm512_cmplt_pd_mask(two,one);
+  }
+
+  inline SIMD_mask operator==(const SIMD_int &one, const SIMD_int &two) {
+    return _mm512_cmpeq_epi32_mask(one,two);
+  }
+
+  inline SIMD_mask operator==(const SIMD_float &one, const SIMD_float &two) {
+    return _mm512_cmpeq_ps_mask(one,two);
+  }
+
+  inline SIMD_mask operator==(const SIMD_double &one, const SIMD_double &two) {
+    return _mm512_cmpeq_pd_mask(one,two);
+  }
+
+  // ------- Typecast operations
+
+  inline void SIMD_cast(const SIMD_int &one, SIMD_float &two) {
+    two = _mm512_cvtepi32_ps(one);
+  }
+
+  inline void SIMD_cast(const SIMD_int &one, SIMD_double &two) {
+    two = _mm512_cvtepi32lo_pd(one);
+  }
+
  // ------- Reduction operations

  inline int SIMD_max(const SIMD_int &i) {
@ -482,6 +534,55 @@ namespace ip_simd {
    return _mm512_reduce_add_pd(i);
  }

+  // i indices should be positive
+  inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i,
+				       SIMD_float &v1) {
+    SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
+    SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
+    SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
+    if (todo_mask) {
+      SIMD_int lz  = _mm512_lzcnt_epi32(cd);
+      SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
+				      _mm512_lzcnt_epi32(cd));
+      
+      while(todo_mask) {
+	SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
+	SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, 
+							  todo_bcast);
+	SIMD_float am_perm;
+	am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
+					     now_mask, lid, v1);
+	v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm);
+	todo_mask = _mm512_kxor(todo_mask, now_mask);
+      }
+    }
+  }
+
+  // i indices should be positive
+  inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i,
+				       SIMD_double &v1) {
+    SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
+    SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
+    SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
+    if (todo_mask) {
+      SIMD_int lz  = _mm512_lzcnt_epi32(cd);
+      SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
+				      _mm512_lzcnt_epi32(cd));
+      lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid));
+      
+      while(todo_mask) {
+	SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
+	SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd, 
+							  todo_bcast);
+	SIMD_double am_perm;
+	am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
+					     now_mask, lid, v1);
+	v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm);
+	todo_mask = _mm512_kxor(todo_mask, now_mask);
+      }
+    }
+  }
+
  // i indices should be positive
  inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i,
 				       SIMD_float &v1, SIMD_float &v2,
@ -655,6 +756,166 @@ namespace ip_simd {
 				       _MM_SCALE_2);
  }

+  inline SIMD_float SIMD_ev_add(const SIMD_float &one, 
+				const SIMD_float &two) {
+    return _mm512_add_ps(one,two);
+  }
+
+  inline SIMD_double SIMD_ev_add(const SIMD_double &one, 
+				 const SIMD_double &two) {
+    return _mm512_add_pd(one,two);
+  }
+
+  inline SIMD_double SIMD_ev_add(const SIMD_double &one, 
+				 const SIMD_float &two) {
+    SIMD_double twod = _mm512_cvtps_pd(_mm512_castps512_ps256(two));
+    SIMD_double ans = _mm512_add_pd(one,twod);
+    twod = _mm512_cvtps_pd(_mm512_castps512_ps256(
+			     _mm512_shuffle_f32x4(two,two,238)));
+    return _mm512_add_pd(ans,twod);
+  }
+
+  inline void SIMD_jeng_update(const SIMD_mask &rmask, float *force, 
+			       const SIMD_int &joffset, SIMD_float &eng) {
+    SIMD_float jeng;
+    SIMD_conflict_pi_reduce1(rmask, joffset, eng);
+    jeng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), rmask, joffset, 
+				    force, _MM_SCALE_1);
+    jeng = jeng + eng;
+    _mm512_mask_i32scatter_ps(force, rmask, joffset, jeng, _MM_SCALE_1);
+  }
+
+  inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, 
+			       const SIMD_int &joffset, SIMD_double &eng) {
+    SIMD_double jeng;
+    SIMD_conflict_pi_reduce1(rmask, joffset, eng);
+    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
+				      force, _MM_SCALE_2);
+    jeng = jeng + eng;
+    _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
+  }
+
+  inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force, 
+			       const SIMD_int &joffset, SIMD_float &eng) {
+    SIMD_double engd, jeng;
+    engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng));
+    SIMD_conflict_pi_reduce1(rmask, joffset, engd);
+    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
+				      force, _MM_SCALE_2);
+    jeng = jeng + engd;
+    _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
+
+    SIMD_mask rmask2 = rmask >> 8;
+    engd = _mm512_cvtps_pd(_mm512_castps512_ps256(
+			     _mm512_shuffle_f32x4(eng,eng,238)));
+    SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
+    SIMD_conflict_pi_reduce1(rmask2, joffset2, engd);
+    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, 
+				      force, _MM_SCALE_2);
+    jeng = jeng + engd;
+    _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jeng, _MM_SCALE_2);
+  }
+
+  inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force, 
+				  const SIMD_int &joffset1, SIMD_float &eng) {
+  }
+
+  inline void SIMD_jeng_update_hi(const SIMD_mask &mask, double *force, 
+				  const SIMD_int &joffset1, SIMD_double &eng) {
+    SIMD_mask rmask = mask >> 8;
+    SIMD_int joffset = _mm512_shuffle_i32x4(joffset1, joffset1, 238);
+
+    SIMD_double jeng;
+    SIMD_conflict_pi_reduce1(rmask, joffset, eng);
+    jeng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
+				      force, _MM_SCALE_2);
+    jeng = jeng + eng;
+    _mm512_mask_i32loscatter_pd(force, rmask, joffset, jeng, _MM_SCALE_2);
+  }
+
+  inline void SIMD_safe_jforce(const SIMD_mask &m, float *force,
+			       const SIMD_int &i, SIMD_float &fx,
+			       SIMD_float &fy, SIMD_float &fz) {
+    SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
+    SIMD_float jfrc;
+    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, 
+				    _MM_SCALE_1);
+    jfrc = jfrc + fx;
+    _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1);
+    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1, 
+				    _MM_SCALE_1);
+    jfrc = jfrc + fy;
+    _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1);
+    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2,
+				    _MM_SCALE_1);
+    jfrc = jfrc + fz;
+    _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
+  }
+
+  inline void SIMD_safe_jforce(const SIMD_mask &m, double *force,
+			       const SIMD_int &i, SIMD_double &fx,
+			       SIMD_double &fy, SIMD_double &fz) {
+    SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
+    SIMD_double jfrc;
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, 
+				      _MM_SCALE_2);
+    jfrc = jfrc + fx;
+    _mm512_mask_i32loscatter_pd(force, m, i, jfrc, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 1, 
+				      _MM_SCALE_2);
+    jfrc = jfrc + fy;
+    _mm512_mask_i32loscatter_pd(force+1, m, i, jfrc, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force + 2,
+				      _MM_SCALE_2);
+    jfrc = jfrc + fz;
+    _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
+  }
+
+  inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force, 
+			       const SIMD_int &joffset, SIMD_float &amx,
+			       SIMD_float &amy, SIMD_float &amz) {
+    SIMD_double amxd, amyd, amzd;
+    amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx));
+    amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy));
+    amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
+    SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
+    SIMD_double jfrc;
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
+				      force, _MM_SCALE_2);
+    jfrc = jfrc + amxd;
+    _mm512_mask_i32loscatter_pd(force, rmask, joffset, jfrc, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
+				      force + 1, _MM_SCALE_2);
+    jfrc = jfrc + amyd;
+    _mm512_mask_i32loscatter_pd(force+1, rmask, joffset, jfrc, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask, joffset, 
+				      force + 2, _MM_SCALE_2);
+    jfrc = jfrc + amzd;
+    _mm512_mask_i32loscatter_pd(force+2, rmask, joffset, jfrc, _MM_SCALE_2);
+
+    SIMD_mask rmask2 = rmask >> 8;
+    amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
+				_mm512_shuffle_f32x4(amx,amx,238)));
+    amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
+                                _mm512_shuffle_f32x4(amy,amy,238)));
+    amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(
+                                _mm512_shuffle_f32x4(amz,amz,238)));
+    SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
+    SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, 
+				      force, _MM_SCALE_2);
+    jfrc = jfrc + amxd;
+    _mm512_mask_i32loscatter_pd(force, rmask2, joffset2, jfrc, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, 
+				      force + 1, _MM_SCALE_2);
+    jfrc = jfrc + amyd;
+    _mm512_mask_i32loscatter_pd(force+1, rmask2, joffset2, jfrc, _MM_SCALE_2);
+    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), rmask2, joffset2, 
+				      force + 2, _MM_SCALE_2);
+    jfrc = jfrc + amzd;
+    _mm512_mask_i32loscatter_pd(force+2, rmask2, joffset2, jfrc, _MM_SCALE_2);
+  }
+
  inline void SIMD_jforce_update(const SIMD_mask &m, float *force,
 				 const SIMD_int &i, const SIMD_float &fx,
 				 const SIMD_float &fy, const SIMD_float &fz) {
@ -1086,37 +1347,48 @@ namespace ip_simd {
    }
  }

-  inline void SIMD_acc_three(const SIMD_mask &hmask, 
-			     const SIMD_float &facrad, const int eatom,
-			     SIMD_float &sevdwl, SIMD_float &fwtmp,
-			     SIMD_float &fjtmp, SIMD_float &fwtmp2,
-			     SIMD_float &fjtmp2) {
+  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, 
+			     const int eatom, SIMD_float &sevdwl, 
+			     SIMD_float &fwtmp, SIMD_float &fjtmp, 
+			     SIMD_float &fwtmp2, SIMD_float &fjtmp2, 
+			     const SIMD_int &k, float *force) {
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad);
    if (eatom) {
      SIMD_float hevdwl = facrad * SIMD_set((float)0.33333333);
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
+      SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
+      SIMD_float keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), hmask, 
+						 k, force + 3, _MM_SCALE_1);
+      keng = keng + hevdwl;
+      _mm512_mask_i32scatter_ps(force + 3, hmask, k, keng, _MM_SCALE_1);
    }
  }

-  inline void SIMD_acc_three(const SIMD_mask &hmask, 
-			     const SIMD_double &facrad, const int eatom,
-			     SIMD_double &sevdwl, SIMD_double &fwtmp,
-			     SIMD_double &fjtmp, SIMD_double &fwtmp2,
-			     SIMD_double &fjtmp2) {
+  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_double &facrad,
+			     const int eatom, SIMD_double &sevdwl, 
+			     SIMD_double &fwtmp, SIMD_double &fjtmp, 
+			     SIMD_double &fwtmp2, SIMD_double &fjtmp2, 
+			     const SIMD_int &k, double *force) {
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad);
    if (eatom) {
      SIMD_double hevdwl = facrad * SIMD_set((double)0.33333333);
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
+      SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
+      SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), 
+						    hmask, k, force + 3, 
+						    _MM_SCALE_2);
+      keng = keng + hevdwl;
+      _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
    }
  }

-  inline void SIMD_acc_three(const SIMD_mask &hmask, 
-			     const SIMD_float &facrad, const int eatom,
-			     SIMD_double &sevdwl, SIMD_double &fwtmp,
-			     SIMD_double &fjtmp, SIMD_double &fwtmp2,
-			     SIMD_double &fjtmp2) {
+  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad, 
+			     const int eatom, SIMD_double &sevdwl, 
+			     SIMD_double &fwtmp, SIMD_double &fjtmp, 
+			     SIMD_double &fwtmp2, SIMD_double &fjtmp2, 
+			     const SIMD_int &k, double *force) {
    SIMD_double facradd;
    facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(facrad));
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facradd);
@ -1124,6 +1396,12 @@ namespace ip_simd {
      SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333);
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
+      SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
+      SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), 
+						    hmask, k, force + 3, 
+						    _MM_SCALE_2);
+      keng = keng + hevdwl;
+      _mm512_mask_i32loscatter_pd(force + 3, hmask, k, keng, _MM_SCALE_2);
    }
    SIMD_mask hmask2 = hmask >> 8;
    facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(
@ -1133,6 +1411,13 @@ namespace ip_simd {
      SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333);
      fwtmp2 = SIMD_add(fwtmp2, hmask2, fwtmp2, hevdwl);
      fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
+      SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238);
+      SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl);
+      SIMD_double keng = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), 
+						    hmask2, k2, force + 3, 
+						    _MM_SCALE_2);
+      keng = keng + hevdwl;
+      _mm512_mask_i32loscatter_pd(force + 3, hmask2, k2, keng, _MM_SCALE_2);
    }
  }

@ -1372,7 +1657,9 @@ namespace ip_simd {

  inline void SIMD_iforce_update(const SIMD_mask &m, float *force,
 				 const SIMD_int &i, const SIMD_float &fx,
-				 const SIMD_float &fy, const SIMD_float &fz) {
+				 const SIMD_float &fy, const SIMD_float &fz,
+				 const int EVFLAG, const int eatom,
+				 const SIMD_float &fwtmp) {
    SIMD_float jfrc;
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force, 
 				    _MM_SCALE_1);
@ -1386,11 +1673,21 @@ namespace ip_simd {
 				    _MM_SCALE_1);
    jfrc = jfrc + fz;
    _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
+    if (EVFLAG) {
+      if (eatom) {
+	jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3,
+					_MM_SCALE_1);
+	jfrc = jfrc + fwtmp;
+	_mm512_mask_i32scatter_ps(force+3, m, i, jfrc, _MM_SCALE_1);
+      }
+    }
  }

  inline void SIMD_iforce_update(const SIMD_mask &m, double *force,
 				 const SIMD_int &i, const SIMD_double &fx,
-				 const SIMD_double &fy, const SIMD_double &fz) {
+				 const SIMD_double &fy, const SIMD_double &fz,
+				 const int EVFLAG, const int eatom,
+				 const SIMD_double &fwtmp) {
    SIMD_double jfrc;
    jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, force, 
 				      _MM_SCALE_2);
@ -1404,6 +1701,14 @@ namespace ip_simd {
 				      _MM_SCALE_2);
    jfrc = jfrc + fz;
    _mm512_mask_i32loscatter_pd(force+2, m, i, jfrc, _MM_SCALE_2);
+    if (EVFLAG) {
+      if (eatom) {
+	jfrc = _mm512_mask_i32logather_pd(_mm512_undefined_pd(), m, i, 
+					  force + 3, _MM_SCALE_2);
+	jfrc = jfrc + fwtmp;
+	_mm512_mask_i32loscatter_pd(force+3, m, i, jfrc, _MM_SCALE_2);
+      }
+    }
  }

  #ifdef SW_GATHER_TEST
--- a/src/USER-INTEL/neigh_half_bin_intel.cpp
+++ b/src/USER-INTEL/neigh_half_bin_intel.cpp
@ -369,6 +369,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
      const int list_size = (ito + tid + 1) * maxnbors;
      int ct = (ifrom + tid) * maxnbors;
      int *neighptr = firstneigh + ct;
+
      for (int i = ifrom; i < ito; i++) {
        int j, k, n, n2, itype, jtype, ibin;
        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
@ -861,22 +862,23 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
      int e_ito = ito;
      if (ito == num) {
 	int imod = ito % swidth;
-	if (imod) e_ito += swidth - e_ito;
+	if (imod) e_ito += swidth - imod;
      }
-      const int list_size = (e_ito + tid + 1) * maxnbors;
+      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
      #else
      const int swidth = 1;
      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
      ifrom += astart;
      ito += astart;
-      const int list_size = (ito + tid + 1) * maxnbors;
+      const int list_size = (ito + tid * 2 + 2) * maxnbors;
      #endif

      int which;

      int pack_offset = maxnbors * swidth;
-      int ct = (ifrom + tid) * maxnbors;
+      int ct = (ifrom + tid * 2) * maxnbors;
      int *neighptr = firstneigh + ct;
+      const int obound = pack_offset + maxnbors * 2;

      int max_chunk = 0;
      int lane = 0;
@ -934,6 +936,8 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
 	  }
 	}

+	if (raw_count > obound) *overflow = 1;
+
        #if defined(LMP_SIMD_COMPILER)
 	#ifdef _LMP_INTEL_OFFLOAD
 	int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
@ -994,7 +998,6 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
 	  }
 	}
 	int ns = (n - lane) / swidth;
-	if (ns > maxnbors || n2 > list_size) *overflow = 1;
 	for (int u = pack_offset; u < n2; u++) {
 	  neighptr[n] = neighptr[u];
 	  n += swidth;
@ -1011,7 +1014,6 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
 	#ifdef OUTER_CHUNK
 	if (ns > max_chunk) max_chunk = ns;
 	lane++;
-	pack_offset -= maxnbors;
 	if (lane == swidth) {
 	  ct += max_chunk * swidth;
 	  const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
@ -1021,10 +1023,10 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
 	  max_chunk = 0;
 	  pack_offset = maxnbors * swidth;
 	  lane = 0;
-	  if (ct + pack_offset + maxnbors > list_size) {
+	  if (ct + obound > list_size) {
            if (i < ito - 1) {
 	      *overflow = 1;
-	      ct = (ifrom + tid) * maxnbors;
+	      ct = (ifrom + tid * 2) * maxnbors;
            }
 	  }
 	}
@ -1034,10 +1036,10 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
 	const int edge = (ct % alignb);
 	if (edge) ct += alignb - edge;
 	neighptr = firstneigh + ct;
-	if (ct + pack_offset + maxnbors > list_size) {
+	if (ct + obound > list_size) {
 	  if (i < ito - 1) {
 	    *overflow = 1;
-	    ct = (ifrom + tid) * maxnbors;
+	    ct = (ifrom + tid * 2) * maxnbors;
 	  }
 	}
 	#endif
@ -1965,15 +1967,14 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
      int e_ito = ito;
      if (ito == num) {
 	int imod = ito % pack_width;
-	if (imod) e_ito += pack_width - e_ito;
+	if (imod) e_ito += pack_width - imod;
      }
-      const int list_size = (e_ito + tid + 1) * maxnbors;
-
+      const int list_size = (e_ito + tid * 2 + 2) * maxnbors;
      int which;
-
      int pack_offset = maxnbors * pack_width;
-      int ct = (ifrom + tid) * maxnbors;
+      int ct = (ifrom + tid * 2) * maxnbors;
      int *neighptr = firstneigh + ct;
+      const int obound = pack_offset + maxnbors * 2;

      int max_chunk = 0;
      int lane = 0;
@ -2011,6 +2012,8 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
          }
        }

+	if (raw_count > obound) *overflow = 1;
+
        #if defined(LMP_SIMD_COMPILER)
 	#ifdef _LMP_INTEL_OFFLOAD
 	int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
@ -2084,7 +2087,6 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
          }
        }
 	int ns = (n - lane) / pack_width;
-	if (ns > maxnbors || n2 > list_size) *overflow = 1;
 	atombin[i] = ns;
 	for (int u = pack_offset; u < n2; u++) {
 	  neighptr[n] = neighptr[u];
@ -2098,7 +2100,6 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,

 	if (ns > max_chunk) max_chunk = ns;
 	lane++;
-	pack_offset -= maxnbors;
 	if (lane == pack_width) {
 	  ct += max_chunk * pack_width;
 	  const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
@ -2108,10 +2109,10 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
 	  max_chunk = 0;
 	  pack_offset = maxnbors * pack_width;
 	  lane = 0;
-	  if (ct + pack_offset + maxnbors > list_size) {
+	  if (ct + obound > list_size) {
  	    if (i < ito - 1) {
 	      *overflow = 1;
-	      ct = (ifrom + tid) * maxnbors;
+	      ct = (ifrom + tid * 2) * maxnbors;
 	    }
          }
 	}
--- a/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_cut_intel.cpp
@ -358,12 +358,18 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,

        IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
      } // for ii
-      #if defined(_OPENMP)
-      #pragma omp barrier
+
+      #ifndef _LMP_INTEL_OFFLOAD
+      if (vflag == 2)
      #endif
-      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			     nlocal, minlocal, nthreads, f_start, f_stride, 
-			     x);
+      {
+        #if defined(_OPENMP)
+        #pragma omp barrier
+        #endif
+        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+			       nlocal, minlocal, nthreads, f_start, f_stride, 
+			       x, offload);
+      }
    } // end of omp parallel region
    if (EVFLAG) {
      if (EFLAG) {
@ -390,7 +396,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
    fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
-    fix->add_result_array(f_start, ev_global, offload, eatom);
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);
 }
--- a/src/USER-INTEL/pair_buck_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_buck_coul_long_intel.cpp
@ -410,12 +410,18 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
        f[i].z += fztmp;
 	IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
      } // for ii
-      #if defined(_OPENMP)
-      #pragma omp barrier
+
+      #ifndef _LMP_INTEL_OFFLOAD
+      if (vflag == 2)
      #endif
-      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			     nlocal, minlocal, nthreads, f_start, f_stride, 
-			     x);
+      {
+        #if defined(_OPENMP)
+        #pragma omp barrier
+        #endif
+        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+			       nlocal, minlocal, nthreads, f_start, f_stride, 
+			       x, offload);
+      }
    } // end of omp parallel region
    if (EVFLAG) {
      if (EFLAG) {
@ -442,7 +448,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
    fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
-    fix->add_result_array(f_start, ev_global, offload, eatom);
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);
 }
--- a/src/USER-INTEL/pair_buck_intel.cpp
+++ b/src/USER-INTEL/pair_buck_intel.cpp
@ -321,12 +321,18 @@ void PairBuckIntel::eval(const int offload, const int vflag,
        f[i].z += fztmp;
        IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
      } // for ii
-      #if defined(_OPENMP)
-      #pragma omp barrier
+
+      #ifndef _LMP_INTEL_OFFLOAD
+      if (vflag == 2)
      #endif
-      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			     nlocal, minlocal, nthreads, f_start, f_stride, 
-			     x);
+      {
+        #if defined(_OPENMP)
+        #pragma omp barrier
+        #endif
+        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+			       nlocal, minlocal, nthreads, f_start, f_stride, 
+			       x, offload);
+      }
    } // end of omp parallel region
    if (EVFLAG) {
      if (EFLAG) {
@ -353,7 +359,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
    fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
-    fix->add_result_array(f_start, ev_global, offload, eatom);
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);
 }
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@ -793,10 +793,6 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 			     sizeof(FORCE_T));
      const int two_iito = iito * 2;

-      #if defined(_OPENMP)
-      #pragma omp barrier
-      #endif
-
      acc_t *facc = &(f_start[0].x);
      const int sto = two_iito * 4;
      const int fst4 = f_stride * 4;
@ -882,9 +878,9 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
    fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
-    fix->add_result_array(f_start, ev_global, offload,eatom);
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, 2);
  else
-    fix->add_result_array(f_start, 0, offload);
+    fix->add_result_array(f_start, 0, offload, 0, 0, 2);
 }

 /* ---------------------------------------------------------------------- */
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@ -438,12 +438,17 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
 	IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
      } // for ii

-      #if defined(_OPENMP)
-      #pragma omp barrier
+      #ifndef _LMP_INTEL_OFFLOAD
+      if (vflag == 2)
      #endif
-      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			     nlocal, minlocal, nthreads, f_start, f_stride,
-			     x);
+      {
+        #if defined(_OPENMP)
+        #pragma omp barrier
+        #endif
+        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+	  		       nlocal, minlocal, nthreads, f_start, f_stride,
+	                       x, offload);
+      }
    } // end of omp parallel region
    if (EVFLAG) {
      if (EFLAG) {
@ -470,7 +475,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
    fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
-    fix->add_result_array(f_start, ev_global, offload, eatom);
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);
 }
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@ -405,12 +405,18 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
        f[i].z += fztmp;
 	IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
      } // for ii
-      #if defined(_OPENMP)
-      #pragma omp barrier
+
+      #ifndef _LMP_INTEL_OFFLOAD
+      if (vflag == 2)
      #endif
-      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			     nlocal, minlocal, nthreads, f_start, f_stride,
-			     x);
+      {
+        #if defined(_OPENMP)
+        #pragma omp barrier
+        #endif
+        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+			       nlocal, minlocal, nthreads, f_start, f_stride,
+	                       x, offload);
+      }
    } // end of omp parallel region
    if (EVFLAG) {
      if (EFLAG) {
@ -437,7 +443,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
    fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
-    fix->add_result_array(f_start, ev_global, offload, eatom);
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);
 }
--- a/src/USER-INTEL/pair_lj_cut_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@ -349,12 +349,17 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
 	IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
      } // for ii

-      #if defined(_OPENMP)
-      #pragma omp barrier
+      #ifndef _LMP_INTEL_OFFLOAD
+      if (vflag == 2)
      #endif
-      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			     nlocal, minlocal, nthreads, f_start, f_stride,
-			     x);
+      {
+        #if defined(_OPENMP)
+        #pragma omp barrier
+        #endif
+        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+	                       nlocal, minlocal, nthreads, f_start, f_stride,
+	                       x, offload);
+      }
    } // end omp
    if (EVFLAG) {
      if (EFLAG) {
@ -381,7 +386,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
    fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
-    fix->add_result_array(f_start, ev_global, offload, eatom);
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);
 }
--- a/src/USER-INTEL/pair_sw_intel.cpp
+++ b/src/USER-INTEL/pair_sw_intel.cpp
@ -559,12 +559,18 @@ void PairSWIntel::eval(const int offload, const int vflag,
        f[i].z += fztmp;
        IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
      } // for ii
-      #if defined(_OPENMP)
-      #pragma omp barrier
+
+      #ifndef _LMP_INTEL_OFFLOAD
+      if (vflag == 2)
      #endif
-      IP_PRE_fdotr_acc_force(1, EVFLAG,  EFLAG, vflag, eatom, nall,
-                             nlocal, minlocal, nthreads, f_start, f_stride,
-                             x);
+      {
+        #if defined(_OPENMP)
+        #pragma omp barrier
+        #endif
+        IP_PRE_fdotr_acc_force(1, EVFLAG,  EFLAG, vflag, eatom, nall,
+			       nlocal, minlocal, nthreads, f_start, f_stride,
+			       x, offload);
+      }
    } // end omp
    if (EVFLAG) {
      if (EFLAG) {
@ -590,7 +596,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
    fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
-    fix->add_result_array(f_start, ev_global, offload, eatom);
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);
 }
@ -677,7 +683,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
    in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
    in(overflow:length(0) alloc_if(0) free_if(0)) \
    in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
-    in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
+    in(ccachei,ccachej,ccachef:length(0) alloc_if(0) free_if(0)) \
    in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
    in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
    in(ccache_stride3)						\
@ -852,7 +858,7 @@ void PairSWIntel::eval(const int offload, const int vflag,

 	const int ejnum_max = SIMD_max(ejnum);
 	const int ejnumhalf_max = SIMD_max(ejnumhalf);
-	memset(tf, 0, ejnum_max * sizeof(double) * swidth * 3);
+	memset(tf, 0, ejnum_max * sizeof(acc_t) * swidth * 3);
        for (int jj = 0; jj < ejnum_max; jj++) {
          SIMD_int ijtype;
 	  const int coffset = jj * swidth;
@ -1006,9 +1012,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
 			    tf + kcoffset * 3, swidth); 

 	    if (EVFLAG) {
-	      if (EFLAG)
+	      if (EFLAG) {
+		SIMD_int k;
+		if (eatom) {
+		  k = SIMD_load(tj + kcoffset);
+		  k = k << 4;
+		}
 		SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
-			       fwtmp2, fjtmp2);
+			       fwtmp2, fjtmp2, k, &(f[0].x));
+	      }
 	      SIMD_ev_tally_nbor3v(kmask, vflag, fjx, fjy, fjz, fkx, fky, fkz,
 				   delx, dely, delz, delr2x, delr2y, delr2z,
 				   sv0, sv1, sv2, sv3, sv4, sv5);
@ -1021,8 +1033,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
 	    SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp, 
 	                fjxtmp2, fjytmp2, fjztmp2);

-	  //if (EFLAG)
-	  //  if (eatom) f[j].w += fjtmp;
+	  if (EFLAG) {
+	    if (eatom) { 
+	      SIMD_int j = SIMD_load(tj + coffset);
+	      j = j << 4;
+	      SIMD_jeng_update(jmask, &(f[0].x) + 3, j, fjtmp);
+	      if (is_same<flt_t,acc_t>::value == 0)
+		SIMD_jeng_update_hi(jmask, &(f[0].x) + 3, j, fjtmp2);
+	    }
+	  }
        } // for jj first loop

        for (int jj = 0; jj < ejnum_max; jj++) {
@ -1066,15 +1085,15 @@ void PairSWIntel::eval(const int offload, const int vflag,
 	  }
 	} // for jj second loop

-	SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp);
+	SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp,
+			   EVFLAG, eatom, fwtmp);
 	if (is_same<flt_t,acc_t>::value == 0) {
 	  imask = imask >> 8;
 	  SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2, 
-			     fztmp2);
+			     fztmp2, EVFLAG, eatom, fwtmp2);
 	}
 	if (EVFLAG) {
 	  if (EFLAG) oevdwl += SIMD_sum(sevdwl);
-	  // f[i].w
 	  if (vflag == 1) {
 	    ov0 += SIMD_sum(sv0);
 	    ov1 += SIMD_sum(sv1);
@ -1086,11 +1105,18 @@ void PairSWIntel::eval(const int offload, const int vflag,
 	}
 	ilist = ilist + swidth;
      } // for ii
-      #if defined(_OPENMP)
-      #pragma omp barrier
+
+      #ifndef _LMP_INTEL_OFFLOAD
+      if (vflag == 2)
      #endif
-      IP_PRE_fdotr_acc_force(1, EVFLAG,  EFLAG, vflag, eatom, nall, nlocal, 
-	                     minlocal, nthreads, f_start, f_stride, x);
+      {
+        #if defined(_OPENMP)
+        #pragma omp barrier
+        #endif
+        IP_PRE_fdotr_acc_force(1, EVFLAG,  EFLAG, vflag, eatom, nall, nlocal, 
+			       minlocal, nthreads, f_start, f_stride, x, 
+			       offload);
+      }
    } // end omp
  
    if (EVFLAG) {
@ -1117,7 +1143,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
    fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
-    fix->add_result_array(f_start, ev_global, offload, eatom);
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);
 }
--- a/src/USER-INTEL/pair_tersoff_intel.cpp
+++ b/src/USER-INTEL/pair_tersoff_intel.cpp
@ -384,12 +384,18 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
          }
        }
      }
-      #if defined(_OPENMP)
-      #pragma omp barrier
+
+      #ifndef _LMP_INTEL_OFFLOAD
+      if (vflag == 2)
      #endif
-      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
-			     nlocal, minlocal, nthreads, f_start, f_stride, 
-			     x);
+      {
+        #if defined(_OPENMP)
+        #pragma omp barrier
+        #endif
+        IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+	  		       nlocal, minlocal, nthreads, f_start, f_stride, 
+                               x, offload);
+      }
    } // end of omp parallel region
    if (EVFLAG) {
      if (EFLAG) {
@ -419,7 +425,7 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
    fix->stop_watch(TIME_HOST_PAIR);

  if (EVFLAG)
-    fix->add_result_array(f_start, ev_global, offload, eatom);
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
  else
    fix->add_result_array(f_start, 0, offload);

--- a/src/USER-INTEL/verlet_intel.cpp
+++ b/src/USER-INTEL/verlet_intel.cpp
@ -73,11 +73,10 @@ void VerletIntel::init()
  if (nvlist_atom)
    error->all(FLERR,
 	       "Cannot currently get per-atom virials with Intel package.");
-  #ifdef _LMP_INTEL_OFFLOAD
+
  ifix = modify->find_fix("package_intel");
  if (ifix >= 0) fix_intel = static_cast<FixIntel *>(modify->fix[ifix]);
  else fix_intel = 0;
-  #endif

  // set flags for what arrays to clear in force_clear()
  // need to clear additionals arrays if they exist
@ -149,6 +148,8 @@ void VerletIntel::setup()
    else force->kspace->compute_dummy(eflag,vflag);
  }

+  if (fix_intel) fix_intel->sync();
+
  #ifdef _LMP_INTEL_OFFLOAD
  sync_mode = 0;
  if (fix_intel) {
@ -227,6 +228,8 @@ void VerletIntel::setup_minimal(int flag)
    else force->kspace->compute_dummy(eflag,vflag);
  }

+  if (fix_intel) fix_intel->sync();
+
  #ifdef _LMP_INTEL_OFFLOAD
  sync_mode = 0;
  if (fix_intel) {
@ -349,6 +352,8 @@ void VerletIntel::run(int n)
      timer->stamp(Timer::KSPACE);
    }

+    if (fix_intel) fix_intel->sync();
+
    #ifdef _LMP_INTEL_OFFLOAD
    if (sync_mode == 1) {
      fix_intel->sync_coprocessor();
--- a/src/USER-INTEL/verlet_intel.h
+++ b/src/USER-INTEL/verlet_intel.h
@ -21,9 +21,7 @@ IntegrateStyle(verlet/intel,VerletIntel)
 #define LMP_VERLET_INTEL_H

 #include "integrate.h"
-#ifdef LMP_INTEL_OFFLOAD
 #include "fix_intel.h"
-#endif

 namespace LAMMPS_NS {

@ -42,8 +40,9 @@ class VerletIntel : public Integrate {
  int torqueflag,extraflag;

  virtual void force_clear();
-  #ifdef _LMP_INTEL_OFFLOAD
+
  FixIntel *fix_intel;
+  #ifdef _LMP_INTEL_OFFLOAD
  int sync_mode;
  #endif
 };
--- a/src/USER-INTEL/verlet_split_intel.cpp
+++ b/src/USER-INTEL/verlet_split_intel.cpp
@ -378,6 +378,8 @@ void VerletSplitIntel::run(int n)
        timer->stamp(Timer::BOND);
      }

+      if (fix_intel) fix_intel->sync();
+
      #ifdef _LMP_INTEL_OFFLOAD
      if (sync_mode == 1) {
 	fix_intel->sync_coprocessor();
--- a/src/USER-INTEL/verlet_split_intel.h
+++ b/src/USER-INTEL/verlet_split_intel.h
@ -21,9 +21,6 @@ IntegrateStyle(verlet/split/intel,VerletSplitIntel)
 #define LMP_VERLET_SPLIT_INTEL_H

 #include "verlet_intel.h"
-#ifdef LMP_INTEL_OFFLOAD
-#include "fix_intel.h"
-#endif

 namespace LAMMPS_NS {