Adding full hybrid support to the USER-INTEL package and fixing a bug with EAM parameter initialization.

Michael Brown
2018-10-11 05:12:00 -07:00
parent 7faa48b4c3
commit c37deebffa
42 changed files with 1442 additions and 566 deletions

View File

@ -499,7 +499,7 @@ MPI task.
When offloading to a coprocessor, "hybrid"_pair_hybrid.html styles
that require skip lists for neighbor builds cannot be offloaded.
Using "hybrid/overlay"_pair_hybrid.html is allowed. Only one intel
accelerated style may be used with hybrid styles.
accelerated style may be used with hybrid styles when offloading.
"Special_bonds"_special_bonds.html exclusion lists are not currently
supported with offload; however, the same effect can often be
accomplished by setting cutoffs for excluded atom types to 0. None of

View File

@ -314,7 +314,7 @@ void AngleHarmonicIntel::init_style()
template <class flt_t, class acc_t>
void AngleHarmonicIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> * /*buffers*/)
IntelBuffers<flt_t,acc_t> * /*buffers*/)
{
const int bp1 = atom->nangletypes + 1;
fc.set_ntypes(bp1,memory);

View File

@ -291,7 +291,7 @@ void BondFENEIntel::init_style()
template <class flt_t, class acc_t>
void BondFENEIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> * /*buffers*/)
IntelBuffers<flt_t,acc_t> * /*buffers*/)
{
const int bp1 = atom->nbondtypes + 1;
fc.set_ntypes(bp1,memory);

View File

@ -416,7 +416,7 @@ void DihedralOPLSIntel::init_style()
template <class flt_t, class acc_t>
void DihedralOPLSIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> * /*buffers*/)
IntelBuffers<flt_t,acc_t> * /*buffers*/)
{
const int bp1 = atom->ndihedraltypes + 1;
fc.set_ntypes(bp1,memory);

View File

@ -65,6 +65,7 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg)
_nbor_pack_width = 1;
_three_body_neighbor = 0;
_hybrid_nonpair = 0;
_precision_mode = PREC_MODE_MIXED;
_offload_balance = -1.0;
@ -266,8 +267,7 @@ FixIntel::~FixIntel()
double *time1 = off_watch_pair();
double *time2 = off_watch_neighbor();
int *overflow = get_off_overflow_flag();
if (_offload_balance != 0.0 && time1 != NULL && time2 != NULL &&
overflow != NULL) {
if (_offload_balance != 0.0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
}
@ -314,34 +314,63 @@ void FixIntel::init()
int nstyles = 0;
if (force->pair_match("hybrid", 1) != NULL) {
_pair_hybrid_flag = 1;
PairHybrid *hybrid = (PairHybrid *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
nstyles++;
if (force->newton_pair != 0 && force->pair->no_virial_fdotr_compute)
error->all(FLERR,
"Intel package requires fdotr virial with newton on.");
} else if (force->pair_match("hybrid/overlay", 1) != NULL) {
_pair_hybrid_flag = 1;
PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
nstyles++;
else
force->pair->no_virial_fdotr_compute = 1;
if (force->newton_pair != 0 && force->pair->no_virial_fdotr_compute)
error->all(FLERR,
"Intel package requires fdotr virial with newton on.");
} else
_pair_hybrid_flag = 0;
if (nstyles > 1 && _pair_hybrid_flag) _pair_hybrid_flag = 2;
else if (force->newton_pair == 0) _pair_hybrid_flag = 0;
_pair_hybrid_zero = 0;
_zero_master = 0;
if (_pair_hybrid_flag && _hybrid_nonpair)
if (_pair_hybrid_flag > 1 || force->newton_pair == 0)
_pair_hybrid_zero = 1;
_hybrid_nonpair = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (offload_balance() != 0.0) {
_pair_hybrid_zero = 0;
if (force->newton_pair == 0) _pair_hybrid_flag = 0;
if (nstyles > 1)
error->all(FLERR,
"Currently, cannot offload more than one intel style with hybrid.");
}
if (nstyles > 1)
error->all(FLERR,
"Currently, cannot use more than one intel style with hybrid.");
#endif
check_neighbor_intel();
int off_mode = 0;
if (_offload_balance != 0.0) off_mode = 1;
if (_precision_mode == PREC_MODE_SINGLE) {
_single_buffers->zero_ev();
_single_buffers->grow_ncache(off_mode,_nthreads);
_single_buffers->free_list_ptrs();
} else if (_precision_mode == PREC_MODE_MIXED) {
_mixed_buffers->zero_ev();
_mixed_buffers->grow_ncache(off_mode,_nthreads);
_mixed_buffers->free_list_ptrs();
} else {
_double_buffers->zero_ev();
_double_buffers->grow_ncache(off_mode,_nthreads);
_double_buffers->free_list_ptrs();
}
_need_reduce = 0;
@ -349,7 +378,7 @@ void FixIntel::init()
/* ---------------------------------------------------------------------- */
void FixIntel::setup(int /*vflag*/)
void FixIntel::setup(int vflag)
{
if (neighbor->style != Neighbor::BIN)
error->all(FLERR,
@ -395,8 +424,7 @@ void FixIntel::pair_init_check(const bool cdmessage)
double *time1 = off_watch_pair();
double *time2 = off_watch_neighbor();
int *overflow = get_off_overflow_flag();
if (_offload_balance !=0.0 && time1 != NULL && time2 != NULL &&
overflow != NULL) {
if (_offload_balance !=0.0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
in(overflow:length(5) alloc_if(1) free_if(0))
@ -419,6 +447,21 @@ void FixIntel::pair_init_check(const bool cdmessage)
#endif
}
#ifndef LMP_INTEL_NBOR_COMPAT
if (force->pair->manybody_flag && atom->molecular) {
int flag = 0;
if (atom->nbonds > 0 && force->special_lj[1] == 0.0 &&
force->special_coul[1] == 0.0) flag = 1;
if (atom->nangles > 0 && force->special_lj[2] == 0.0 &&
force->special_coul[2] == 0.0) flag = 1;
if (atom->ndihedrals > 0 && force->special_lj[3] == 0.0 &&
force->special_coul[3] == 0.0) flag = 1;
if (flag)
error->all(FLERR,"Add -DLMP_INTEL_NBOR_COMPAT to build for special_bond"
"exclusions with Intel");
}
#endif
int need_tag = 0;
if (atom->molecular) need_tag = 1;
@ -477,11 +520,13 @@ void FixIntel::bond_init_check()
if (force->pair_match("/intel", 0) != NULL)
intel_pair = 1;
else if (force->pair_match("hybrid", 1) != NULL) {
_hybrid_nonpair = 1;
PairHybrid *hybrid = (PairHybrid *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
intel_pair = 1;
} else if (force->pair_match("hybrid/overlay", 1) != NULL) {
_hybrid_nonpair = 1;
PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
@ -501,11 +546,13 @@ void FixIntel::kspace_init_check()
if (force->pair_match("/intel", 0) != NULL)
intel_pair = 1;
else if (force->pair_match("hybrid", 1) != NULL) {
_hybrid_nonpair = 1;
PairHybrid *hybrid = (PairHybrid *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
intel_pair = 1;
} else if (force->pair_match("hybrid/overlay", 1) != NULL) {
_hybrid_nonpair = 1;
PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
for (int i = 0; i < hybrid->nstyles; i++)
if (strstr(hybrid->keywords[i], "/intel") != NULL)
@ -522,51 +569,60 @@ void FixIntel::check_neighbor_intel()
{
#ifdef _LMP_INTEL_OFFLOAD
_full_host_list = 0;
#endif
const int nrequest = neighbor->nrequest;
const int nrequest = neighbor->nrequest;
for (int i = 0; i < nrequest; ++i) {
#ifdef _LMP_INTEL_OFFLOAD
if (_offload_balance != 0.0 && neighbor->requests[i]->intel == 0) {
_full_host_list = 1;
_offload_noghost = 0;
}
#endif
if (neighbor->requests[i]->skip && _offload_balance != 0.0)
error->all(FLERR, "Cannot yet use hybrid styles with Intel offload.");
// avoid flagging a neighbor list as both USER-INTEL and USER-OMP
if (neighbor->requests[i]->intel)
neighbor->requests[i]->omp = 0;
if (neighbor->requests[i]->skip)
error->all(FLERR, "Hybrid styles with Intel package are unsupported.");
}
#else
// avoid flagging a neighbor list as both USER-INTEL and USER-OMP
const int nrequest = neighbor->nrequest;
for (int i = 0; i < nrequest; ++i)
if (neighbor->requests[i]->intel)
neighbor->requests[i]->omp = 0;
#endif
}
/* ---------------------------------------------------------------------- */
void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
void FixIntel::_sync_main_arrays(const int prereverse)
{
if (!prereverse) _zero_master = 1;
int done_this_step = prereverse;
if (_pair_hybrid_zero == 0) done_this_step = 1;
if (_force_array_m != 0) {
if (_need_reduce) {
reduce_results(&_force_array_m[0].x);
_need_reduce = 0;
}
add_results(_force_array_m, _ev_array_d, _results_eatom, _results_vatom,0);
_force_array_m = 0;
if (done_this_step) _force_array_m = 0;
else _ev_array_d = 0;
} else if (_force_array_d != 0) {
if (_need_reduce) {
reduce_results(&_force_array_d[0].x);
_need_reduce = 0;
}
add_results(_force_array_d, _ev_array_d, _results_eatom, _results_vatom,0);
_force_array_d = 0;
if (done_this_step) _force_array_d = 0;
else _ev_array_d = 0;
} else if (_force_array_s != 0) {
if (_need_reduce) {
reduce_results(&_force_array_s[0].x);
_need_reduce = 0;
}
add_results(_force_array_s, _ev_array_s, _results_eatom, _results_vatom,0);
_force_array_s = 0;
if (done_this_step) _force_array_s = 0;
else _ev_array_s = 0;
}
#ifdef _LMP_INTEL_OFFLOAD
@ -576,6 +632,13 @@ void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
/* ---------------------------------------------------------------------- */
void FixIntel::pre_reverse(int /*eflag*/, int /*vflag*/)
{
_sync_main_arrays(1);
}
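For reference, a minimal standalone sketch (not LAMMPS code; the SyncState struct and its flags are illustrative stand-ins for the FixIntel members used above) of the deferred-sync bookkeeping: while hybrid sub-styles are still accumulating (prereverse == 0 and _pair_hybrid_zero set), the force array is kept and only the energy/virial pointer is dropped; the call from pre_reverse() finishes the step and releases the force array as well.

#include <cstdio>

// Illustrative stand-ins for FixIntel::_pair_hybrid_zero, _force_array_* and
// _ev_array_* (a sketch, not the actual class).
struct SyncState {
  bool hybrid_zero;   // ~ _pair_hybrid_zero
  bool have_forces;   // ~ _force_array_m != 0
  bool have_ev;       // ~ _ev_array_d != 0
};

void sync_main_arrays(SyncState &s, int prereverse) {
  int done_this_step = prereverse;
  if (!s.hybrid_zero) done_this_step = 1;
  if (s.have_forces) {
    // reduce_results(...) and add_results(...) would run here
    if (done_this_step) s.have_forces = false;  // step finished: release buffers
    else s.have_ev = false;                     // more sub-styles to come: keep forces
  }
}

int main() {
  SyncState s{true, true, true};
  sync_main_arrays(s, 0);  // from add_result_array() after one hybrid sub-style
  printf("after sub-style:   forces=%d ev=%d\n", s.have_forces, s.have_ev);
  sync_main_arrays(s, 1);  // from pre_reverse() at the end of the step
  printf("after pre_reverse: forces=%d ev=%d\n", s.have_forces, s.have_ev);
  return 0;
}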
/* ---------------------------------------------------------------------- */
template <class acc_t>
void FixIntel::reduce_results(acc_t * _noalias const f_scalar)
{
@ -657,7 +720,7 @@ template <class ft, class acc_t>
void FixIntel::add_results(const ft * _noalias const f_in,
const acc_t * _noalias const ev_global,
const int eatom, const int vatom,
const int /*offload*/) {
const int offload) {
start_watch(TIME_PACK);
int f_length;
#ifdef _LMP_INTEL_OFFLOAD

View File

@ -74,11 +74,12 @@ class FixIntel : public Fix {
inline int nbor_pack_width() const { return _nbor_pack_width; }
inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
inline int three_body_neighbor() { return _three_body_neighbor; }
inline void three_body_neighbor(const int /*i*/) { _three_body_neighbor = 1; }
inline void three_body_neighbor(const int i) { _three_body_neighbor = i; }
inline int need_zero(const int tid) {
if (_need_reduce == 0 && tid > 0) return 1;
return 0;
else if (_zero_master && tid == 0) { _zero_master = 0; return 1; }
else return 0;
}
inline void set_reduce_flag() { if (_nthreads > 1) _need_reduce = 1; }
inline int lrt() {
@ -100,7 +101,10 @@ class FixIntel : public Fix {
IntelBuffers<double,double> *_double_buffers;
int _precision_mode, _nthreads, _nbor_pack_width, _three_body_neighbor;
int _pair_hybrid_flag;
// These should be removed in subsequent update w/ simpler hybrid arch
int _pair_hybrid_zero, _hybrid_nonpair, _zero_master;
public:
inline int* get_overflow_flag() { return _overflow_flag; }
inline int* get_off_overflow_flag() { return _off_overflow_flag; }
@ -210,6 +214,8 @@ class FixIntel : public Fix {
_alignvar(double _stopwatch_offload_neighbor[1],64);
_alignvar(double _stopwatch_offload_pair[1],64);
void _sync_main_arrays(const int prereverse);
template <class ft>
void reduce_results(ft * _noalias const f_in);
@ -238,7 +244,7 @@ class FixIntel : public Fix {
/* ---------------------------------------------------------------------- */
void FixIntel::get_buffern(const int /*offload*/, int &nlocal, int &nall,
void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
int &minlocal) {
#ifdef _LMP_INTEL_OFFLOAD
if (_separate_buffers) {
@ -273,7 +279,7 @@ void FixIntel::get_buffern(const int /*offload*/, int &nlocal, int &nall,
/* ---------------------------------------------------------------------- */
void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
double *ev_in, const int /*offload*/,
double *ev_in, const int offload,
const int eatom, const int vatom,
const int rflag) {
#ifdef _LMP_INTEL_OFFLOAD
@ -282,6 +288,8 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
_off_results_vatom = vatom;
_off_force_array_d = f_in;
_off_ev_array_d = ev_in;
if (_pair_hybrid_flag && force->pair->fdotr_is_set())
_sync_main_arrays(1);
return;
}
#endif
@ -296,12 +304,15 @@ void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
if (_overflow_flag[LMP_OVERFLOW])
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
if (_pair_hybrid_flag > 1 ||
(_pair_hybrid_flag && force->pair->fdotr_is_set())) _sync_main_arrays(0);
}
/* ---------------------------------------------------------------------- */
void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
double *ev_in, const int /*offload*/,
double *ev_in, const int offload,
const int eatom, const int vatom,
const int rflag) {
#ifdef _LMP_INTEL_OFFLOAD
@ -310,6 +321,8 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
_off_results_vatom = vatom;
_off_force_array_m = f_in;
_off_ev_array_d = ev_in;
if (_pair_hybrid_flag && force->pair->fdotr_is_set())
_sync_main_arrays(1);
return;
}
#endif
@ -324,12 +337,16 @@ void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
if (_overflow_flag[LMP_OVERFLOW])
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
if (_pair_hybrid_flag > 1 ||
(_pair_hybrid_flag && force->pair->fdotr_is_set()))
_sync_main_arrays(0);
}
/* ---------------------------------------------------------------------- */
void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
float *ev_in, const int /*offload*/,
float *ev_in, const int offload,
const int eatom, const int vatom,
const int rflag) {
#ifdef _LMP_INTEL_OFFLOAD
@ -338,6 +355,8 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
_off_results_vatom = vatom;
_off_force_array_s = f_in;
_off_ev_array_s = ev_in;
if (_pair_hybrid_flag && force->pair->fdotr_is_set())
_sync_main_arrays(1);
return;
}
#endif
@ -352,6 +371,10 @@ void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
if (_overflow_flag[LMP_OVERFLOW])
error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
if (_pair_hybrid_flag > 1 ||
(_pair_hybrid_flag && force->pair->fdotr_is_set()))
_sync_main_arrays(0);
}
/* ---------------------------------------------------------------------- */
@ -487,16 +510,16 @@ The compiler version used to build LAMMPS is not supported when using
offload to a coprocessor. There could be performance or correctness
issues. Please use 14.0.1.106 or 15.1.133 or later.
E: Currently, cannot use more than one intel style with hybrid.
E: Currently, cannot offload more than one intel style with hybrid.
Currently, hybrid pair styles can only use the intel suffix for one of the
pair styles.
Currently, when using offload, hybrid pair styles can only use the intel
suffix for one of the pair styles.
E: Cannot yet use hybrid styles with Intel package.
E: Cannot yet use hybrid styles with Intel offload.
The hybrid pair style configuration is not yet supported by the Intel
package. Support is limited to hybrid/overlay or a hybrid style that does
not require a skip list.
The hybrid pair style configuration is not yet supported when using offload
within the Intel package. Support is limited to hybrid/overlay or a hybrid
style that does not require a skip list.
W: Leaving a core/node free can improve performance for offload
@ -538,4 +561,16 @@ E: Too few atoms for load balancing offload.
When using offload to a coprocessor, each MPI task must have at least 2
atoms throughout the simulation.
E: Intel package requires fdotr virial with newton on.
This error can occur with a hybrid pair style that mixes styles that are
incompatible with the newton pair setting turned on. Try turning the
newton pair setting off.
E: Add -DLMP_INTEL_NBOR_COMPAT to build for special_bond exclusions with Intel
When using a manybody pair style, bonds/angles/dihedrals, and special_bond
exclusions, LAMMPS should be built with the above compile flag for compatible
results.
*/

View File

@ -428,7 +428,7 @@ void ImproperCvffIntel::init_style()
template <class flt_t, class acc_t>
void ImproperCvffIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> * /*buffers*/)
IntelBuffers<flt_t,acc_t> * /*buffers*/)
{
const int bp1 = atom->nimpropertypes + 1;
fc.set_ntypes(bp1,memory);

View File

@ -24,7 +24,9 @@ using namespace LAMMPS_NS;
template <class flt_t, class acc_t>
IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0),
_buf_size(0), _buf_local_size(0) {
_buf_size(0), _buf_local_size(0), _n_list_ptrs(1), _max_list_ptrs(4) {
_neigh_list_ptrs = new IntelNeighListPtrs[_max_list_ptrs];
_neigh_list_ptrs[0].cnumneigh = 0;
_list_alloc_atoms = 0;
_ntypes = 0;
_off_map_listlocal = 0;
@ -55,6 +57,7 @@ IntelBuffers<flt_t, acc_t>::~IntelBuffers()
free_all_nbor_buffers();
free_ccache();
set_ntypes(0);
delete []_neigh_list_ptrs;
}
/* ---------------------------------------------------------------------- */
@ -109,7 +112,7 @@ void IntelBuffers<flt_t, acc_t>::free_buffers()
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal,
const int nthreads,
const int /*offload_end*/)
const int offload_end)
{
free_buffers();
_buf_size = static_cast<double>(nall) * 1.1 + 1;
@ -186,11 +189,9 @@ void IntelBuffers<flt_t, acc_t>::free_nmax()
const int * tag = _off_map_tag;
const int * special = _off_map_special;
const int * nspecial = _off_map_nspecial;
if (tag != 0 && special != 0 && nspecial !=0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(tag:alloc_if(0) free_if(1)) \
nocopy(special,nspecial:alloc_if(0) free_if(1))
}
#pragma offload_transfer target(mic:_cop) \
nocopy(tag:alloc_if(0) free_if(1)) \
nocopy(special,nspecial:alloc_if(0) free_if(1))
_off_map_nmax = 0;
_host_nmax = 0;
}
@ -200,7 +201,7 @@ void IntelBuffers<flt_t, acc_t>::free_nmax()
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow_nmax(const int /*offload_end*/)
void IntelBuffers<flt_t, acc_t>::_grow_nmax(const int offload_end)
{
#ifdef _LMP_INTEL_OFFLOAD
free_nmax();
@ -243,46 +244,117 @@ template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::free_list_local()
{
if (_off_map_listlocal > 0) {
int * cnumneigh = _cnumneigh;
if (_neigh_list_ptrs[0].cnumneigh) {
int * cnumneigh = _neigh_list_ptrs[0].cnumneigh;
_neigh_list_ptrs[0].cnumneigh = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (_off_map_ilist != NULL) {
#pragma offload_transfer target(mic:_cop) \
nocopy(cnumneigh:alloc_if(0) free_if(1))
}
#endif
lmp->memory->destroy(cnumneigh);
}
#ifdef _LMP_INTEL_OFFLOAD
if (_off_map_ilist != NULL) {
const int * ilist = _off_map_ilist;
const int * numneigh = _off_map_numneigh;
const int ** firstneigh = (const int **)_off_map_firstneigh;
_off_map_ilist = NULL;
if (numneigh != 0 && ilist != 0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(ilist,numneigh,cnumneigh:alloc_if(0) free_if(1))
}
#pragma offload_transfer target(mic:_cop) \
nocopy(ilist,firstneigh,numneigh:alloc_if(0) free_if(1))
}
#endif
lmp->memory->destroy(cnumneigh);
_off_map_listlocal = 0;
}
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::free_list_ptrs()
{
for (int list_num = 1; list_num < _n_list_ptrs; list_num++) {
if (_neigh_list_ptrs[list_num].size) {
lmp->memory->destroy(_neigh_list_ptrs[list_num].cnumneigh);
lmp->memory->destroy(_neigh_list_ptrs[list_num].numneighhalf);
}
_neigh_list_ptrs[list_num].size = 0;
_neigh_list_ptrs[list_num].list_ptr = 0;
}
_n_list_ptrs = 1;
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::grow_data3(NeighList *list,
int *&numneighhalf,
int *&cnumneigh)
{
const int size = list->get_maxlocal();
int list_num;
for (list_num = 0; list_num < _n_list_ptrs; list_num++)
if (_neigh_list_ptrs[list_num].list_ptr == (void*)list) break;
if (list_num == _n_list_ptrs) {
if (_n_list_ptrs == _max_list_ptrs) {
_max_list_ptrs *= 2;
IntelNeighListPtrs *new_list = new IntelNeighListPtrs[_max_list_ptrs];
for (int i = 0; i < _n_list_ptrs; i++) new_list[i] = _neigh_list_ptrs[i];
delete []_neigh_list_ptrs;
_neigh_list_ptrs = new_list;
}
_neigh_list_ptrs[list_num].list_ptr = (void *)list;
_neigh_list_ptrs[list_num].size = 0;
_n_list_ptrs++;
}
if (size > _neigh_list_ptrs[list_num].size) {
if (_neigh_list_ptrs[list_num].size) {
lmp->memory->destroy(_neigh_list_ptrs[list_num].cnumneigh);
lmp->memory->destroy(_neigh_list_ptrs[list_num].numneighhalf);
}
lmp->memory->create(_neigh_list_ptrs[list_num].cnumneigh, size,
"_cnumneigh");
lmp->memory->create(_neigh_list_ptrs[list_num].numneighhalf, size,
"_numneighhalf");
_neigh_list_ptrs[list_num].size = size;
}
numneighhalf = _neigh_list_ptrs[list_num].numneighhalf;
cnumneigh = _neigh_list_ptrs[list_num].cnumneigh;
}
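A compact standalone sketch of the registry pattern used by grow_data3() above (illustrative only; Slot, Registry and find_or_add are hypothetical stand-ins, not LAMMPS types): one descriptor per NeighList pointer, reused on repeated calls, with the descriptor array doubled in capacity when full.

#include <cstdio>
#include <cstring>

struct Slot { const void *key; int size; };

struct Registry {
  Slot *slots; int n, cap;
  Registry() : n(0), cap(4) { slots = new Slot[cap](); }
  ~Registry() { delete[] slots; }
  Slot &find_or_add(const void *key) {
    for (int i = 0; i < n; i++)
      if (slots[i].key == key) return slots[i];   // reuse an existing slot
    if (n == cap) {                               // double capacity and copy
      cap *= 2;
      Slot *grown = new Slot[cap]();
      memcpy(grown, slots, n * sizeof(Slot));
      delete[] slots;
      slots = grown;
    }
    slots[n].key = key;
    slots[n].size = 0;
    return slots[n++];
  }
};

int main() {
  Registry r;
  int a, b;
  r.find_or_add(&a).size = 10;   // first call registers a new slot
  r.find_or_add(&b).size = 20;
  printf("%d %d %d\n", r.find_or_add(&a).size, r.find_or_add(&b).size, r.n);  // 10 20 2
  return 0;
}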
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow_list_local(NeighList *list,
const int /*offload_end*/)
const int three_body,
const int offload_end)
{
free_list_local();
int size = list->get_maxlocal();
lmp->memory->create(_cnumneigh, size, "_cnumneigh");
_off_map_listlocal = size;
if (three_body)
lmp->memory->create(_neigh_list_ptrs[0].cnumneigh, size, "_cnumneigh");
#ifdef _LMP_INTEL_OFFLOAD
if (offload_end > 0) {
int tb_size = size;
if (three_body == 0) {
lmp->memory->create(_neigh_list_ptrs[0].cnumneigh, 16, "_cnumneigh");
tb_size = 16;
}
int ** firstneigh = list->firstneigh;
int * numneigh = list->numneigh;
int * ilist = list->ilist;
int * cnumneigh = _cnumneigh;
if (cnumneigh != 0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
nocopy(cnumneigh:length(size) alloc_if(1) free_if(0))
}
int * cnumneigh = _neigh_list_ptrs[0].cnumneigh;
#pragma offload_transfer target(mic:_cop) \
nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
nocopy(firstneigh:length(size) alloc_if(1) free_if(0)) \
nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
nocopy(cnumneigh:length(tb_size) alloc_if(1) free_if(0))
_off_map_ilist = ilist;
_off_map_firstneigh = firstneigh;
_off_map_numneigh = numneigh;
}
#endif
@ -313,7 +385,7 @@ template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList * /*list*/,
const int nlocal,
const int nthreads,
const int /*offload_end*/,
const int offload_end,
const int pack_width)
{
free_nbor_list();
@ -382,7 +454,7 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::grow_ccache(const int /*off_flag*/,
void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
const int nthreads,
const int width)
{
@ -481,7 +553,7 @@ void IntelBuffers<flt_t, acc_t>::free_ncache()
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void IntelBuffers<flt_t, acc_t>::grow_ncache(const int /*off_flag*/,
void IntelBuffers<flt_t, acc_t>::grow_ncache(const int off_flag,
const int nthreads)
{
const int nsize = get_max_nbors() * 3;
@ -576,12 +648,12 @@ void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes,
if (_ntypes > 0) {
#ifdef _LMP_INTEL_OFFLOAD
flt_t * cutneighsqo = _cutneighsq[0];
if (_off_threads > 0 && cutneighsqo != 0) {
if (_off_threads > 0) {
#pragma offload_transfer target(mic:_cop) \
nocopy(cutneighsqo:alloc_if(0) free_if(1))
}
flt_t * cutneighghostsqo;
if (_cutneighghostsq && _off_threads > 0 && cutneighghostsqo != 0) {
if (_cutneighghostsq && _off_threads > 0) {
cutneighghostsqo = _cutneighghostsq[0];
#pragma offload_transfer target(mic:_cop) \
nocopy(cutneighghostsqo:alloc_if(0) free_if(1))
@ -637,6 +709,8 @@ double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads)
tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int);
tmem += _ntypes * _ntypes * sizeof(int);
tmem += _buf_local_size + (_n_list_ptrs - 1) * _buf_local_size * 2;
return tmem;
}

View File

@ -33,12 +33,20 @@ namespace LAMMPS_NS {
#define QUAT_T typename IntelBuffers<flt_t,acc_t>::quat_t
#define FORCE_T typename IntelBuffers<flt_t,acc_t>::vec3_acc_t
struct IntelNeighListPtrs {
void * list_ptr;
int *cnumneigh;
int *numneighhalf;
int size;
};
// May not need a separate force array for mixed/double
template <class flt_t, class acc_t>
class IntelBuffers {
public:
typedef struct { flt_t x,y,z; int w; } atom_t;
typedef struct { flt_t w,i,j,k; } quat_t;
typedef struct { flt_t x,y; } vec2_t;
typedef struct { flt_t x,y,z,w; } vec3_t;
typedef struct { flt_t x,y,z,w; } vec4_t;
typedef struct { acc_t x,y,z,w; } vec3_acc_t;
@ -62,8 +70,12 @@ class IntelBuffers {
void free_buffers();
void free_nmax();
inline void set_bininfo(int *atombin, int *binpacked)
{ _atombin = atombin; _binpacked = binpacked; }
inline void set_bininfo(int *atombin, int *binpacked) {
_atombin = atombin;
_binpacked = binpacked;
_neigh_list_ptrs[0].numneighhalf = atombin;
}
inline void grow(const int nall, const int nlocal, const int nthreads,
const int offload_end) {
if (nall >= _buf_size || nlocal >= _buf_local_size)
@ -79,18 +91,22 @@ class IntelBuffers {
free_nmax();
free_list_local();
free_ncache();
free_list_ptrs();
}
inline void grow_list(NeighList *list, const int nlocal, const int nthreads,
const int offload_end, const int pack_width=1) {
grow_list_local(list, offload_end);
const int three_body, const int offload_end,
const int pack_width=1) {
grow_list_local(list, three_body, offload_end);
grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
}
void free_list_local();
inline void grow_list_local(NeighList *list, const int offload_end) {
inline void grow_list_local(NeighList *list, const int three_body,
const int offload_end) {
_neigh_list_ptrs[0].list_ptr = (void *)list;
if (list->get_maxlocal() > _off_map_listlocal)
_grow_list_local(list, offload_end);
_grow_list_local(list, three_body, offload_end);
}
void free_ccache();
@ -133,26 +149,36 @@ class IntelBuffers {
_grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
}
void set_ntypes(const int ntypes, const int use_ghost_cut = 0);
void set_ntypes(const int ntypes, const int use_ghost_cut = 1);
inline int * firstneigh(const NeighList * /*list*/) { return _list_alloc; }
inline int * cnumneigh(const NeighList * /*list*/) { return _cnumneigh; }
inline int * intel_list(const NeighList * /*list*/) { return _list_alloc; }
inline int * get_atombin() { return _atombin; }
inline int * get_binpacked() { return _binpacked; }
inline int * cnumneigh() { return _neigh_list_ptrs[0].cnumneigh; }
inline void get_list_data3(const NeighList *list, int *&numneighhalf,
int *&cnumneigh) {
for (int i = 0; i < _n_list_ptrs; i++)
if ((void *)list == _neigh_list_ptrs[i].list_ptr) {
numneighhalf = _neigh_list_ptrs[i].numneighhalf;
cnumneigh = _neigh_list_ptrs[i].cnumneigh;
}
}
void grow_data3(NeighList *list, int *&numneighhalf, int *&cnumneigh);
void free_list_ptrs();
inline atom_t * get_x(const int /*offload*/ = 1) {
inline atom_t * get_x(const int offload = 1) {
#ifdef _LMP_INTEL_OFFLOAD
if (_separate_buffers && offload == 0) return _host_x;
#endif
return _x;
}
inline flt_t * get_q(const int /*offload*/ = 1) {
inline flt_t * get_q(const int offload = 1) {
#ifdef _LMP_INTEL_OFFLOAD
if (_separate_buffers && offload == 0) return _host_q;
#endif
return _q;
}
inline quat_t * get_quat(const int /*offload*/ = 1) {
inline quat_t * get_quat(const int offload = 1) {
#ifdef _LMP_INTEL_OFFLOAD
if (_separate_buffers && offload == 0) return _host_quat;
#endif
@ -298,6 +324,9 @@ class IntelBuffers {
int _list_alloc_atoms;
int *_list_alloc, *_cnumneigh, *_atombin, *_binpacked;
IntelNeighListPtrs *_neigh_list_ptrs;
int _n_list_ptrs, _max_list_ptrs;
flt_t **_cutneighsq, **_cutneighghostsq;
int _ntypes;
@ -325,7 +354,7 @@ class IntelBuffers {
int _off_map_nmax, _cop, _off_ccache, _off_ncache;
int *_off_map_ilist;
int *_off_map_special, *_off_map_nspecial, *_off_map_tag;
int *_off_map_numneigh;
int **_off_map_firstneigh, *_off_map_numneigh;
bool _off_list_alloc;
#endif
@ -336,7 +365,8 @@ class IntelBuffers {
void _grow(const int nall, const int nlocal, const int nthreads,
const int offload_end);
void _grow_nmax(const int offload_end);
void _grow_list_local(NeighList *list, const int offload_end);
void _grow_list_local(NeighList *list, const int three_body,
const int offload_end);
void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
const int offload_end, const int pack_width);
};

View File

@ -20,6 +20,9 @@
#if (__INTEL_COMPILER_BUILD_DATE > 20160720)
#define LMP_INTEL_USE_SIMDOFF
#endif
#pragma warning (disable:3948)
#pragma warning (disable:3949)
#pragma warning (disable:13200)
#endif
#ifdef __INTEL_OFFLOAD
@ -606,7 +609,7 @@ inline double MIC_Wtime() {
if (newton) \
f_stride = buffers->get_stride(nall); \
else \
f_stride = buffers->get_stride(inum); \
f_stride = buffers->get_stride(nlocal); \
}
#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \

View File

@ -213,6 +213,12 @@ namespace ip_simd {
_MM_SCALE_8);
}
inline SIMD_int SIMD_gatherz(const SIMD_mask &m, const int *p,
const SIMD_int &i) {
return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p,
_MM_SCALE_4);
}
inline SIMD_float SIMD_gatherz(const SIMD_mask &m, const float *p,
const SIMD_int &i) {
return _mm512_mask_i32gather_ps( _mm512_set1_ps((float)0), m, i, p,
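For reference, a standalone illustration of the masked gather-with-zero-fill that the new integer SIMD_gatherz overload wraps (assumes an AVX-512 capable compiler and CPU; not part of the package): lanes whose mask bit is clear receive 0 instead of a loaded value.

#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(64) int data[16];
  for (int k = 0; k < 16; k++) data[k] = 100 + k;
  // gather data[0..15] in lane order, but only where the mask bit is set
  const __m512i idx = _mm512_set_epi32(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
  const __mmask16 m = 0x00FF;                    // low 8 lanes active
  __m512i v = _mm512_mask_i32gather_epi32(_mm512_set1_epi32(0), m, idx, data, 4);
  alignas(64) int out[16];
  _mm512_store_epi32(out, v);
  for (int k = 0; k < 16; k++) printf("%d ", out[k]);   // 100..107, then eight zeros
  printf("\n");
  return 0;
}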

View File

@ -192,7 +192,6 @@ void NBinIntel::bin_atoms(IntelBuffers<flt_t,acc_t> * buffers) {
// ---------- Bin Atoms -------------
_fix->start_watch(TIME_HOST_NEIGHBOR);
//const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const atombin = this->_atombin;
int * _noalias const binpacked = this->_binpacked;

View File

@ -86,7 +86,7 @@ void NPairFullBinGhostIntel::fbi(NeighList * list,
// only uses offload_end_neighbor to check whether we are doing offloading
// at all, no need to correct this later
buffers->grow_list(list, nall, comm->nthreads, off_end,
buffers->grow_list(list, nall, comm->nthreads, 0, off_end,
_fix->nbor_pack_width());
int need_ic = 0;
@ -106,7 +106,7 @@ void NPairFullBinGhostIntel::fbi(NeighList * list,
/* ---------------------------------------------------------------------- */
template<class flt_t, class acc_t, int need_ic>
void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
IntelBuffers<flt_t,acc_t> * buffers,
const int pstart, const int pend) {
if (pend-pstart == 0) return;
@ -116,7 +116,8 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
const int aend = nall;
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const firstneigh = buffers->firstneigh(list);
int * _noalias const intel_list = buffers->intel_list(list);
int ** _noalias const firstneigh = list->firstneigh;
const int e_nall = nall_t;
const int molecular = atom->molecular;
@ -140,7 +141,6 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
int * _noalias const ilist = list->ilist;
int * _noalias numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int nstencil = this->nstencil;
const int * _noalias const stencil = this->stencil;
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
@ -228,7 +228,7 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
in(cutneighghostsq:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(intel_list:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
in(atombin:length(aend) alloc_if(0) free_if(0)) \
@ -295,7 +295,7 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
int pack_offset = maxnbors;
int ct = (ifrom + tid * 2) * maxnbors;
int *neighptr = firstneigh + ct;
int *neighptr = intel_list + ct;
const int obound = pack_offset + maxnbors * 2;
const int toffs = tid * ncache_stride;
@ -528,12 +528,12 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
if (ns > maxnbors) *overflow = 1;
ilist[i] = i;
cnumneigh[i] = ct;
firstneigh[i] = intel_list + ct;
numneigh[i] = ns;
ct += ns;
IP_PRE_edge_align(ct, sizeof(int));
neighptr = firstneigh + ct;
neighptr = intel_list + ct;
if (ct + obound > list_size) {
if (i < ito - 1) {
*overflow = 1;
@ -564,13 +564,12 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
if (offload) {
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
_fix->start_watch(TIME_HOST_NEIGHBOR);
firstneigh[0] = intel_list;
for (int n = 0; n < aend; n++) {
ilist[n] = n;
numneigh[n] = 0;
}
} else {
for (int i = 0; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
if (separate_buffers) {
_fix->start_watch(TIME_PACK);
_fix->set_neighbor_host_sizes();
@ -581,10 +580,5 @@ void NPairFullBinGhostIntel::fbi(const int /*offload*/, NeighList * list,
_fix->stop_watch(TIME_PACK);
}
}
#else
#pragma vector aligned
#pragma simd
for (int i = 0; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
#endif
}

View File

@ -70,7 +70,8 @@ fbi(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
int offload_noghost = _fix->offload_noghost();
#endif
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end,
buffers->grow_list(list, atom->nlocal, comm->nthreads,
_fix->three_body_neighbor(), off_end,
_fix->nbor_pack_width());
int need_ic = 0;

View File

@ -71,7 +71,7 @@ hbni(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
int offload_noghost = _fix->offload_noghost();
#endif
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end);
buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end);
int need_ic = 0;
if (atom->molecular)

View File

@ -71,7 +71,7 @@ hbnti(NeighList *list, IntelBuffers<flt_t,acc_t> *buffers) {
int offload_noghost = _fix->offload_noghost();
#endif
buffers->grow_list(list, atom->nlocal, comm->nthreads, off_end);
buffers->grow_list(list, atom->nlocal, comm->nthreads, 0, off_end);
int need_ic = 0;
if (atom->molecular)

View File

@ -0,0 +1,47 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
// Only used with hybrid to generate a list for a non-intel style; the
// standard (non-intel) build routines are used.
#ifdef NPAIR_CLASS
NPairStyle(halffull/newtoff/intel,
NPairHalffullNewtoff,
NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
NP_ORTHO | NP_TRI | NP_INTEL)
NPairStyle(halffull/newtoff/skip/intel,
NPairHalffullNewtoff,
NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL)
NPairStyle(halffull/newtoff/ghost/intel,
NPairHalffullNewtoff,
NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
NP_ORTHO | NP_TRI | NP_GHOST | NP_INTEL)
NPairStyle(halffull/newtoff/skip/ghost/intel,
NPairHalffullNewtoff,
NP_HALF_FULL | NP_NEWTOFF | NP_NSQ | NP_BIN | NP_MULTI | NP_HALF |
NP_ORTHO | NP_TRI | NP_SKIP | NP_GHOST | NP_INTEL)
#endif
/* ERROR/WARNING messages:
*/

View File

@ -0,0 +1,233 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include "npair_halffull_newton_intel.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "modify.h"
#include "molecule.h"
#include "domain.h"
#include "my_page.h"
#include "error.h"
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
NPairHalffullNewtonIntel::NPairHalffullNewtonIntel(LAMMPS *lmp) : NPair(lmp) {
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
_fix = static_cast<FixIntel *>(modify->fix[ifix]);
}
/* ----------------------------------------------------------------------
build half list from full list
pair stored once if i,j are both owned and i < j
if j is ghost, only store if j coords are "above and to the right" of i
works if full list is a skip list
------------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void NPairHalffullNewtonIntel::build_t(NeighList *list,
IntelBuffers<flt_t,acc_t> *buffers)
{
const int inum_full = list->listfull->inum;
const int nlocal = atom->nlocal;
const int e_nall = nlocal + atom->nghost;
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const ilist = list->ilist;
int * _noalias const numneigh = list->numneigh;
int ** _noalias const firstneigh = list->firstneigh;
const int * _noalias const ilist_full = list->listfull->ilist;
const int * _noalias const numneigh_full = list->listfull->numneigh;
const int ** _noalias const firstneigh_full =
(const int ** const)list->listfull->firstneigh;
#if defined(_OPENMP)
#pragma omp parallel
#endif
{
int tid, ifrom, ito;
IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, comm->nthreads);
// each thread has its own page allocator
MyPage<int> &ipage = list->ipage[tid];
ipage.reset();
// loop over parent full list
for (int ii = ifrom; ii < ito; ii++) {
int n = 0;
int *neighptr = ipage.vget();
const int i = ilist_full[ii];
const flt_t xtmp = x[i].x;
const flt_t ytmp = x[i].y;
const flt_t ztmp = x[i].z;
// loop over full neighbor list
const int * _noalias const jlist = firstneigh_full[i];
const int jnum = numneigh_full[i];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int jj = 0; jj < jnum; jj++) {
const int joriginal = jlist[jj];
const int j = joriginal & NEIGHMASK;
int addme = 1;
if (j < nlocal) {
if (i > j) addme = 0;
} else {
if (x[j].z < ztmp) addme = 0;
if (x[j].z == ztmp) {
if (x[j].y < ytmp) addme = 0;
if (x[j].y == ytmp && x[j].x < xtmp) addme = 0;
}
}
if (addme)
neighptr[n++] = joriginal;
}
ilist[ii] = i;
firstneigh[i] = neighptr;
numneigh[i] = n;
int pad_end = n;
IP_PRE_neighbor_pad(pad_end, 0);
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
avg=INTEL_COMPILE_WIDTH/2
#endif
for ( ; n < pad_end; n++)
neighptr[n] = e_nall;
ipage.vgot(n);
if (ipage.status())
error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
}
}
list->inum = inum_full;
}
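A minimal standalone sketch of the pair-selection rule applied in the loop above (illustrative, not LAMMPS code; keep_pair and the coordinates are hypothetical): an i-j pair from the full list is stored once, either because j is an owned atom with a larger index, or because j is a ghost lying "above and to the right" of i when comparing z, then y, then x.

#include <cstdio>

struct Pos { double x, y, z; };

// Returns true when the i-j pair should be stored in the half list
// (mirrors the addme logic in build_t(); indices and nlocal are illustrative).
bool keep_pair(int i, int j, int nlocal, const Pos *x) {
  if (j < nlocal) return i < j;                   // both owned: store once, i < j
  if (x[j].z != x[i].z) return x[j].z > x[i].z;   // ghost: compare z, then y, then x
  if (x[j].y != x[i].y) return x[j].y > x[i].y;
  return x[j].x >= x[i].x;
}

int main() {
  const Pos x[3] = {{0,0,0}, {1,0,0}, {0,0,-1}};
  // atom 2 is treated as a ghost (index >= nlocal) sitting below atom 0
  printf("%d %d\n", keep_pair(0, 1, 2, x), keep_pair(0, 2, 2, x));  // prints: 1 0
  return 0;
}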
/* ----------------------------------------------------------------------
build half list from full 3-body list
half list is already stored as first part of 3-body list
------------------------------------------------------------------------- */
template <class flt_t>
void NPairHalffullNewtonIntel::build_t3(NeighList *list, int *numhalf)
{
const int inum_full = list->listfull->inum;
const int e_nall = atom->nlocal + atom->nghost;
int * _noalias const ilist = list->ilist;
int * _noalias const numneigh = list->numneigh;
int ** _noalias const firstneigh = list->firstneigh;
const int * _noalias const ilist_full = list->listfull->ilist;
const int * _noalias const numneigh_full = numhalf;
const int ** _noalias const firstneigh_full =
(const int ** const)list->listfull->firstneigh;
int packthreads = 1;
if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;
#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
{
int tid, ifrom, ito;
IP_PRE_omp_range_id(ifrom, ito, tid, inum_full, packthreads);
// each thread has its own page allocator
MyPage<int> &ipage = list->ipage[tid];
ipage.reset();
// loop over parent full list
for (int ii = ifrom; ii < ito; ii++) {
int n = 0;
int *neighptr = ipage.vget();
const int i = ilist_full[ii];
// loop over full neighbor list
const int * _noalias const jlist = firstneigh_full[i];
const int jnum = numneigh_full[ii];
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int jj = 0; jj < jnum; jj++) {
const int joriginal = jlist[jj];
neighptr[n++] = joriginal;
}
ilist[ii] = i;
firstneigh[i] = neighptr;
numneigh[i] = n;
int pad_end = n;
IP_PRE_neighbor_pad(pad_end, 0);
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
avg=INTEL_COMPILE_WIDTH/2
#endif
for ( ; n < pad_end; n++)
neighptr[n] = e_nall;
ipage.vgot(n);
if (ipage.status())
error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
}
}
list->inum = inum_full;
}
/* ---------------------------------------------------------------------- */
void NPairHalffullNewtonIntel::build(NeighList *list)
{
if (_fix->three_body_neighbor() == 0) {
if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
build_t(list, _fix->get_mixed_buffers());
else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
build_t(list, _fix->get_double_buffers());
else
build_t(list, _fix->get_single_buffers());
} else {
int *nhalf, *cnum;
if (_fix->precision() == FixIntel::PREC_MODE_MIXED) {
_fix->get_mixed_buffers()->get_list_data3(list->listfull, nhalf, cnum);
build_t3<float>(list, nhalf);
} else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
_fix->get_double_buffers()->get_list_data3(list->listfull, nhalf, cnum);
build_t3<double>(list, nhalf);
} else {
_fix->get_single_buffers()->get_list_data3(list->listfull, nhalf, cnum);
build_t3<float>(list, nhalf);
}
}
}

View File

@ -0,0 +1,72 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#ifdef NPAIR_CLASS
NPairStyle(halffull/newton/intel,
NPairHalffullNewtonIntel,
NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
NP_ORTHO | NP_TRI| NP_INTEL)
NPairStyle(halffull/newton/skip/intel,
NPairHalffullNewtonIntel,
NP_HALF_FULL | NP_NEWTON | NP_HALF | NP_NSQ | NP_BIN | NP_MULTI |
NP_ORTHO | NP_TRI | NP_SKIP | NP_INTEL)
#else
#ifndef LMP_NPAIR_HALFFULL_NEWTON_INTEL_H
#define LMP_NPAIR_HALFFULL_NEWTON_INTEL_H
#include "npair.h"
#include "fix_intel.h"
#if defined(_OPENMP)
#include <omp.h>
#endif
namespace LAMMPS_NS {
class NPairHalffullNewtonIntel : public NPair {
public:
NPairHalffullNewtonIntel(class LAMMPS *);
~NPairHalffullNewtonIntel() {}
void build(class NeighList *);
protected:
FixIntel *_fix;
template<class flt_t, class acc_t>
void build_t(NeighList *, IntelBuffers<flt_t,acc_t> *);
template<class flt_t>
void build_t3(NeighList *, int *);
};
}
#endif
#endif
/* ERROR/WARNING messages:
E: The 'package intel' command is required for /intel styles
Self explanatory.
*/

View File

@ -52,12 +52,61 @@ NPairIntel::~NPairIntel() {
/* ---------------------------------------------------------------------- */
void NPairIntel::copy_neighbor_info()
{
NPair::copy_neighbor_info();
if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
copy_cutsq_info(_fix->get_mixed_buffers());
else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
copy_cutsq_info(_fix->get_double_buffers());
else
copy_cutsq_info(_fix->get_single_buffers());
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t>
void NPairIntel::copy_cutsq_info(IntelBuffers<flt_t,acc_t> *buffers) {
int tp1 = atom->ntypes + 1;
int use_ghost_cut = 0;
if (cutneighghostsq)
use_ghost_cut = 1;
buffers->set_ntypes(tp1, use_ghost_cut);
flt_t **cutneighsqb = buffers->get_cutneighsq();
for (int i = 1; i <= atom->ntypes; i++)
for (int j = 1; j <= atom->ntypes; j++)
cutneighsqb[i][j] = cutneighsq[i][j];
flt_t **cutneighghostsqb;
if (use_ghost_cut) {
cutneighghostsqb = buffers->get_cutneighghostsq();
for (int i = 1; i <= atom->ntypes; i++)
for (int j = 1; j <= atom->ntypes; j++)
cutneighghostsqb[i][j] = cutneighghostsq[i][j];
}
#ifdef _LMP_INTEL_OFFLOAD
if (_cop < 0) return;
int tp1sq = tp1 * tp1;
flt_t * ocutneighsq = cutneighsqb[0];
#pragma offload_transfer target(mic:_cop) in(ocutneighsq: length(tp1sq))
if (use_ghost_cut) {
flt_t * ocutneighghostsq = cutneighghostsqb[0];
#pragma offload_transfer target(mic:_cop) \
in(ocutneighghostsq: length(tp1sq))
}
#endif
}
/* ---------------------------------------------------------------------- */
template <class flt_t, class acc_t, int offload_noghost, int need_ic,
int FULL, int TRI, int THREE>
void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
void NPairIntel::bin_newton(const int offload, NeighList *list,
IntelBuffers<flt_t,acc_t> *buffers,
const int astart, const int aend,
const int /*offload_end*/) {
const int offload_end) {
if (aend-astart == 0) return;
@ -71,7 +120,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
const int pack_width = _fix->nbor_pack_width();
const ATOM_T * _noalias const x = buffers->get_x();
int * _noalias const firstneigh = buffers->firstneigh(list);
int * _noalias const intel_list = buffers->intel_list(list);
const int e_nall = nall_t;
const int molecular = atom->molecular;
@ -94,8 +143,10 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
const tagint * _noalias const tag = atom->tag;
int * _noalias const ilist = list->ilist;
int * _noalias numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh(list);
int ** _noalias const firstneigh = list->firstneigh;
int * _noalias const numneigh = list->numneigh;
int * _noalias const cnumneigh = buffers->cnumneigh();
const int nstencil = this->nstencil;
const int * _noalias const stencil = this->stencil;
const flt_t * _noalias const cutneighsq = buffers->get_cutneighsq()[0];
@ -168,6 +219,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
in(binhead:length(mbins+1) alloc_if(0) free_if(0)) \
in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(intel_list:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
out(numneigh:length(0) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
@ -178,7 +230,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
in(ncache_stride,maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
in(offload_end,separate_buffers,astart,aend,nlocal,molecular) \
in(ntypes,xperiodic,yperiodic,zperiodic,xprd_half,yprd_half,zprd_half) \
in(pack_width,special_bound) \
in(pack_width,special_bound) \
out(overflow:length(5) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
signal(tag)
@ -212,7 +264,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(numneigh, overflow, nstencilp, binstart, binend)
shared(overflow, nstencilp, binstart, binend)
#endif
{
#ifdef _LMP_INTEL_OFFLOAD
@ -246,7 +298,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
const int obound = maxnbors * 3;
#endif
int ct = (ifrom + tid * 2) * maxnbors;
int *neighptr = firstneigh + ct;
int *neighptr = intel_list + ct;
int *neighptr2;
if (THREE) neighptr2 = neighptr;
@ -605,12 +657,24 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
if (n > maxnbors) *overflow = 1;
ilist[i] = i;
cnumneigh[i] = ct;
firstneigh[i] = intel_list + ct;
if (THREE) {
numneigh[i] = ns;
cnumneigh[i] = ct;
#ifdef LMP_INTEL_3BODY_FAST
cnumneigh[i] += lane;
#else
// Pad anyway, just in case we have hybrid with 2-body and newton off
int pad_end = ns;
IP_PRE_neighbor_pad(pad_end, offload);
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
avg=INTEL_COMPILE_WIDTH/2
#endif
for ( ; ns < pad_end; ns++)
neighptr[n++] = e_nall;
#endif
numneigh[i] = ns;
} else {
numneigh[i] = n;
int pad_end = n;
@ -631,7 +695,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
if (lane == pack_width) {
ct += max_chunk * pack_width;
IP_PRE_edge_align(ct, sizeof(int));
neighptr = firstneigh + ct;
neighptr = intel_list + ct;
neighptr2 = neighptr;
max_chunk = 0;
lane = 0;
@ -647,7 +711,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
{
ct += n;
//IP_PRE_edge_align(ct, sizeof(int));
neighptr = firstneigh + ct;
neighptr = intel_list + ct;
if (THREE) neighptr2 = neighptr;
if (ct + obound > list_size) {
if (i < ito - 1) {
@ -667,7 +731,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
int ghost_offset = 0, nall_offset = e_nall;
if (separate_buffers) {
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
int * _noalias jlist = firstneigh[i];
int jnum = numneigh[i];
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
#if __INTEL_COMPILER+0 > 1499
@ -712,7 +776,7 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
}
for (int i = ifrom; i < ito; ++i) {
int * _noalias jlist = firstneigh + cnumneigh[i];
int * _noalias jlist = firstneigh[i];
int jnum = numneigh[i];
if (!THREE) IP_PRE_neighbor_pad(jnum, offload);
int jj = 0;
@ -739,13 +803,12 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
if (offload) {
_fix->stop_watch(TIME_OFFLOAD_LATENCY);
_fix->start_watch(TIME_HOST_NEIGHBOR);
firstneigh[0] = intel_list;
for (int n = 0; n < aend; n++) {
ilist[n] = n;
numneigh[n] = 0;
}
} else {
for (int i = astart; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
if (separate_buffers) {
_fix->start_watch(TIME_PACK);
_fix->set_neighbor_host_sizes();
@ -756,11 +819,6 @@ void NPairIntel::bin_newton(const int /*offload*/, NeighList *list,
_fix->stop_watch(TIME_PACK);
}
}
#else
#pragma vector aligned
#pragma simd
for (int i = astart; i < aend; i++)
list->firstneigh[i] = firstneigh + cnumneigh[i];
#endif
}

View File

@ -75,7 +75,8 @@ class NPairIntel : public NPair {
public:
NPairIntel(class LAMMPS *);
~NPairIntel();
virtual void copy_neighbor_info();
#ifdef _LMP_INTEL_OFFLOAD
void grow_stencil();
#endif
@ -83,6 +84,9 @@ class NPairIntel : public NPair {
protected:
FixIntel *_fix;
template <class flt_t, class acc_t>
void copy_cutsq_info(IntelBuffers<flt_t,acc_t> *);
template <class flt_t, class acc_t, int, int, int, int, int>
void bin_newton(const int, NeighList *, IntelBuffers<flt_t,acc_t> *,
const int, const int, const int offload_end = 0);

View File

@ -0,0 +1,230 @@
/* ----------------------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------
Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */
#include "npair_skip_intel.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "atom.h"
#include "atom_vec.h"
#include "comm.h"
#include "modify.h"
#include "molecule.h"
#include "neigh_request.h"
#include "domain.h"
#include "my_page.h"
#include "error.h"
using namespace LAMMPS_NS;
/* ---------------------------------------------------------------------- */
NPairSkipIntel::NPairSkipIntel(LAMMPS *lmp) : NPair(lmp) {
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
"The 'package intel' command is required for /intel styles");
_fix = static_cast<FixIntel *>(modify->fix[ifix]);
_inum_starts = new int[comm->nthreads];
_inum_counts = new int[comm->nthreads];
_full_props = 0;
}
/* ---------------------------------------------------------------------- */
NPairSkipIntel::~NPairSkipIntel() {
delete []_inum_starts;
delete []_inum_counts;
if (_full_props) delete []_full_props;
}
/* ---------------------------------------------------------------------- */
void NPairSkipIntel::copy_neighbor_info()
{
NPair::copy_neighbor_info();
if (_full_props) delete []_full_props;
_full_props = new int[neighbor->nlist];
for (int i = 0; i < neighbor->nlist; i++)
_full_props[i] = neighbor->requests[i]->full;
}
/* ----------------------------------------------------------------------
build skip list for subset of types from parent list
works for half and full lists
works for owned (non-ghost) list, also for ghost list
iskip and ijskip flag which atom types and type pairs to skip
if ghost, also store neighbors of ghost atoms & set inum,gnum correctly
------------------------------------------------------------------------- */
template<class flt_t, int THREE>
void NPairSkipIntel::build_t(NeighList *list, int *numhalf, int *cnumneigh,
int *numhalf_skip)
{
const int nlocal = atom->nlocal;
const int e_nall = nlocal + atom->nghost;
const int * _noalias const type = atom->type;
int * _noalias const ilist = list->ilist;
int * _noalias const numneigh = list->numneigh;
int ** _noalias const firstneigh = (int ** const)list->firstneigh;
const int * _noalias const ilist_skip = list->listskip->ilist;
const int * _noalias const numneigh_skip = list->listskip->numneigh;
const int ** _noalias const firstneigh_skip =
(const int ** const)list->listskip->firstneigh;
const int * _noalias const iskip = list->iskip;
const int ** _noalias const ijskip = (const int ** const)list->ijskip;
int num_skip = list->listskip->inum;
if (list->ghost) num_skip += list->listskip->gnum;
int packthreads;
if (comm->nthreads > INTEL_HTHREADS && THREE==0)
packthreads = comm->nthreads;
else
packthreads = 1;
#if defined(_OPENMP)
#pragma omp parallel if(packthreads > 1)
#endif
{
int tid, ifrom, ito;
IP_PRE_omp_range_id(ifrom, ito, tid, num_skip, packthreads);
// each thread has its own page allocator
MyPage<int> &ipage = list->ipage[tid];
ipage.reset();
int my_inum = ifrom;
_inum_starts[tid] = ifrom;
// loop over parent full list
for (int ii = ifrom; ii < ito; ii++) {
const int i = ilist_skip[ii];
const int itype = type[i];
if (iskip[itype]) continue;
int n = 0;
int *neighptr = ipage.vget();
// loop over parent non-skip list
const int * _noalias const jlist = firstneigh_skip[i];
const int jnum = numneigh_skip[i];
if (THREE) {
const int jnumhalf = numhalf_skip[ii];
for (int jj = 0; jj < jnumhalf; jj++) {
const int joriginal = jlist[jj];
const int j = joriginal & NEIGHMASK;
if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal;
}
numhalf[my_inum] = n;
for (int jj = jnumhalf; jj < jnum; jj++) {
const int joriginal = jlist[jj];
const int j = joriginal & NEIGHMASK;
if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal;
}
} else {
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma ivdep
#endif
for (int jj = 0; jj < jnum; jj++) {
const int joriginal = jlist[jj];
const int j = joriginal & NEIGHMASK;
if (!ijskip[itype][type[j]]) neighptr[n++] = joriginal;
}
}
ilist[my_inum++] = i;
firstneigh[i] = neighptr;
numneigh[i] = n;
int pad_end = n;
IP_PRE_neighbor_pad(pad_end, 0);
#if defined(LMP_SIMD_COMPILER)
#pragma vector aligned
#pragma loop_count min=1, max=INTEL_COMPILE_WIDTH-1, \
avg=INTEL_COMPILE_WIDTH/2
#endif
for ( ; n < pad_end; n++)
neighptr[n] = e_nall;
ipage.vgot(n);
if (ipage.status())
error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
}
_inum_counts[tid] = my_inum;
}
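// each thread filled a disjoint [start,count) segment of ilist (and of
// numhalf for three-body styles); compact the segments into one contiguous list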
int inum = _inum_counts[0];
for (int tid = 1; tid < packthreads; tid++) {
for (int i = _inum_starts[tid]; i < _inum_counts[tid]; i++) {
if (THREE) numhalf[inum] = numhalf[i];
ilist[inum++] = ilist[i];
}
}
list->inum = inum;
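// for three-body styles, record each atom's neighbor-list offset relative to
// the first atom's list so the intel buffers can index from a single base pointer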
if (THREE && num_skip > 0) {
int * const list_start = firstneigh[ilist[0]];
for (int ii = 0; ii < inum; ii++) {
int i = ilist[ii];
cnumneigh[ii] = static_cast<int>(firstneigh[i] - list_start);
}
}
if (list->ghost) {
int num = 0;
int my_inum = list->inum;
for (int i = 0; i < my_inum; i++)
if (ilist[i] < nlocal) num++;
else break;
list->inum = num;
list->gnum = my_inum - num;
}
}
/* ---------------------------------------------------------------------- */
void NPairSkipIntel::build(NeighList *list)
{
if (_fix->three_body_neighbor()==0 ||
_full_props[list->listskip->index] == 0) {
if (_fix->precision() == FixIntel::PREC_MODE_MIXED)
build_t<float,0>(list, 0, 0, 0);
else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE)
build_t<double,0>(list, 0, 0, 0);
else
build_t<float,0>(list, 0, 0, 0);
} else {
int *nhalf, *cnumneigh, *nhalf_skip, *u;
if (_fix->precision() == FixIntel::PREC_MODE_MIXED) {
_fix->get_mixed_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
_fix->get_mixed_buffers()->grow_data3(list, nhalf, cnumneigh);
build_t<float,1>(list, nhalf, cnumneigh, nhalf_skip);
} else if (_fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
_fix->get_double_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
_fix->get_double_buffers()->grow_data3(list, nhalf, cnumneigh);
build_t<double,1>(list, nhalf, cnumneigh, nhalf_skip);
} else {
_fix->get_single_buffers()->get_list_data3(list->listskip,nhalf_skip,u);
_fix->get_single_buffers()->grow_data3(list,nhalf,cnumneigh);
build_t<float,1>(list, nhalf, cnumneigh, nhalf_skip);
}
}
}

View File

@ -0,0 +1,69 @@
/* -*- c++ -*- ----------------------------------------------------------
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
#ifdef NPAIR_CLASS
NPairStyle(skip/intel,
NPairSkipIntel,
NP_SKIP | NP_HALF | NP_FULL |
NP_NSQ | NP_BIN | NP_MULTI |
NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_INTEL)
NPairStyle(skip/ghost/intel,
NPairSkipIntel,
NP_SKIP | NP_HALF | NP_FULL |
NP_NSQ | NP_BIN | NP_MULTI |
NP_NEWTON | NP_NEWTOFF | NP_ORTHO | NP_TRI | NP_GHOST | NP_INTEL)
#else
#ifndef LMP_NPAIR_SKIP_INTEL_H
#define LMP_NPAIR_SKIP_INTEL_H
#include "npair.h"
#include "fix_intel.h"
#if defined(_OPENMP)
#include <omp.h>
#endif
namespace LAMMPS_NS {
class NPairSkipIntel : public NPair {
public:
NPairSkipIntel(class LAMMPS *);
~NPairSkipIntel();
virtual void copy_neighbor_info();
void build(class NeighList *);
protected:
FixIntel *_fix;
int *_inum_starts, *_inum_counts, *_full_props;
template<class flt_t, int THREE>
void build_t(NeighList *, int *numhalf, int *cnumneigh, int *numhalf_skip);
};
}
#endif
#endif
/* ERROR/WARNING messages:
E: The 'package intel' command is required for /intel styles
Self explanatory.
*/

View File

@ -97,6 +97,13 @@ struct LAMMPS_NS::PairAIREBOIntelParam {
namespace {
struct NeighListLMPAIREBO {
int * num; /* num_all */
int * num_half; /* num_all */
int * offset; /* num_all */
int ** entries; /* num_all * num_neighs_per_atom */
};
struct NeighListAIREBO {
int * num; /* num_all */
int * num_half; /* num_all */
@ -125,7 +132,7 @@ struct KernelArgsAIREBOT {
int neigh_from_atom, neigh_to_atom;
int rebuild_flag;
flt_t skin;
struct NeighListAIREBO neigh_lmp;
struct NeighListLMPAIREBO neigh_lmp;
struct NeighListAIREBO neigh_rebo;
PairAIREBOIntelParam<flt_t,acc_t> params;
struct AtomAIREBOT<flt_t> * x; /* num_all */
@ -176,6 +183,11 @@ void PairAIREBOIntel::init_style()
PairAIREBO::init_style();
neighbor->requests[neighbor->nrequest-1]->intel = 1;
const int nrequest = neighbor->nrequest;
for (int i = 0; i < nrequest; ++i)
if (neighbor->requests[i]->skip)
error->all(FLERR, "Cannot yet use airebo/intel with hybrid.");
int ifix = modify->find_fix("package_intel");
if (ifix < 0)
error->all(FLERR,
@ -399,8 +411,7 @@ void PairAIREBOIntel::eval(
ATOM_T * _noalias const x = buffers->get_x(offload);
const int * _noalias const numneighhalf = buffers->get_atombin();
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
tagint * const tag = atom->tag;
const int ntypes = atom->ntypes + 1;
@ -441,7 +452,6 @@ void PairAIREBOIntel::eval(
#pragma offload target(mic:_cop) if(offload) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
@ -472,10 +482,8 @@ void PairAIREBOIntel::eval(
f_stride, x, 0/*q*/);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EVFLAG) {
oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
}
if (EVFLAG)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -508,8 +516,7 @@ void PairAIREBOIntel::eval(
args.skin = skin;
args.neigh_lmp.num = const_cast<int*>(numneigh);
args.neigh_lmp.num_half = const_cast<int*>(numneighhalf);
args.neigh_lmp.offset = const_cast<int*>(cnumneigh);
args.neigh_lmp.entries = const_cast<int*>(firstneigh);
args.neigh_lmp.entries = const_cast<int**>(firstneigh);
args.neigh_rebo.num = REBO_numneigh;
args.neigh_rebo.num_half = REBO_num_skin;
args.neigh_rebo.offset = REBO_cnumneigh;
@ -543,18 +550,14 @@ void PairAIREBOIntel::eval(
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EVFLAG) {
if (EFLAG) {
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
ev_global[5] = ov3;
ev_global[6] = ov4;
ev_global[7] = ov5;
}
#if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
*timer_compute = MIC_Wtime() - *timer_compute;
@ -578,28 +581,20 @@ template<class flt_t, class acc_t>
void PairAIREBOIntel::pack_force_const(IntelBuffers<flt_t,acc_t> * buffers) {
int tp1 = atom->ntypes + 1;
buffers->set_ntypes(tp1,1);
flt_t **cutneighsq = buffers->get_cutneighsq();
flt_t **cutneighghostsq = buffers->get_cutneighghostsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
cut = cutghost[i][j] + neighbor->skin;
cutneighghostsq[i][j] = cutneighghostsq[j][i] = cut*cut;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
#ifdef _LMP_INTEL_OFFLOAD
if (_cop < 0) return;
flt_t * ocutneighsq = cutneighsq[0];
size_t VL = 512 / 8 / sizeof(flt_t);
int ntypes = tp1;
int tp1sq = tp1 * tp1;
@ -607,7 +602,6 @@ void PairAIREBOIntel::pack_force_const(IntelBuffers<flt_t,acc_t> * buffers) {
// it might not be freed if this method is called more than once
int * map = this->map;
#pragma offload_transfer target(mic:_cop) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0)) \
in(map: length(tp1) alloc_if(1) free_if(0))
#endif
@ -1580,7 +1574,7 @@ void ref_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
ka->nC[i] = 0;
ka->nH[i] = 0;
for (int j = 0; j < ka->neigh_lmp.num[i]; j++) {
int ji = ka->neigh_lmp.entries[ka->neigh_lmp.offset[i] + j];
int ji = ka->neigh_lmp.entries[i][j];
flt_t delx = ka->x[i].x - ka->x[ji].x;
flt_t dely = ka->x[i].y - ka->x[ji].y;
flt_t delz = ka->x[i].z - ka->x[ji].z;
@ -2168,7 +2162,7 @@ void ref_lennard_jones_single_atom(KernelArgsAIREBOT<flt_t,acc_t> * ka, int i,
int morseflag) {
AtomAIREBOT<flt_t> * x = ka->x;
int jj;
int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
int * neighs = ka->neigh_lmp.entries[i];
int jnum = ka->neigh_lmp.num_half[i];
for (jj = 0; jj < jnum; jj++) {
int j = neighs[jj];
@ -2332,7 +2326,7 @@ static void aut_rebo_neigh(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
int jnum = ka->rebuild_flag ? ka->neigh_lmp.num[i] :
ka->neigh_rebo.num_half[i];
int * neighs = ka->rebuild_flag ?
&ka->neigh_lmp.entries[ka->neigh_lmp.offset[i]] :
ka->neigh_lmp.entries[i] :
&ka->neigh_rebo.entries[ka->neigh_rebo.offset[i]+jnum];
int * skin_target = &ka->neigh_rebo.entries[offset+ka->num_neighs_per_atom];
int n = 0;
@ -4554,7 +4548,7 @@ static void aut_lennard_jones(KernelArgsAIREBOT<flt_t,acc_t> * ka) {
fvec p_lj40 = fvec::set1(ka->params.lj4[itype][0]);
fvec p_lj41 = fvec::set1(ka->params.lj4[itype][1]);
int * neighs = ka->neigh_lmp.entries + ka->neigh_lmp.offset[i];
int * neighs = ka->neigh_lmp.entries[i];
int jnum = ka->neigh_lmp.num_half[i];
bool tap_success = aut_airebo_lj_test_all_paths(ka, i, &test_path_result);

View File

@ -107,4 +107,10 @@ E: Cannot open AIREBO potential file %s
The specified AIREBO potential file cannot be opened. Check that the
path and name are correct.
E: Cannot yet use airebo/intel with hybrid.
Pair style airebo/intel cannot currently be used as part of a hybrid
pair style (with the exception of hybrid/overlay).
*/

View File

@ -144,8 +144,7 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
@ -182,7 +181,6 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
in(c_force, c_energy, c_cut:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(q:length(q_size) alloc_if(0) free_if(0)) \
@ -204,8 +202,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
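// with newton off, a hybrid sub-list may cover only part of the local atoms,
// so force-buffer entries the loop below never touches must be zeroed first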
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -231,8 +231,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
const C_CUT_T * _noalias const c_cuti = c_cut + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -361,13 +361,10 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5;
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -375,6 +372,8 @@ void PairBuckCoulCutIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -439,19 +438,16 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
fc.set_ntypes(tp1, ntable, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -483,12 +479,10 @@ void PairBuckCoulCutIntel::pack_force_const(ForceConst<flt_t> &fc,
C_FORCE_T * c_force = fc.c_force[0];
C_ENERGY_T * c_energy = fc.c_energy[0];
C_CUT_T * c_cut = fc.c_cut[0];
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
in(c_force, c_energy, c_cut: length(tp1sq) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(c_force, c_energy, c_cut: length(tp1sq) alloc_if(0) free_if(0))
#endif
}

View File

@ -144,8 +144,7 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
@ -206,7 +205,6 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
in(rho_inv:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(q:length(q_size) alloc_if(0) free_if(0)) \
@ -230,8 +228,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -265,8 +265,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
const flt_t * _noalias const rho_invi = rho_inv + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -437,13 +437,10 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5;
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -451,6 +448,8 @@ void PairBuckCoulLongIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -523,19 +522,16 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
fc.set_ntypes(tp1, ntable, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -591,15 +587,13 @@ void PairBuckCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
flt_t * detable = fc.detable;
flt_t * ctable = fc.ctable;
flt_t * dctable = fc.dctable;
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
in(rho_inv: length(tp1sq) alloc_if(0) free_if(0)) \
in(table: length(ntable) alloc_if(0) free_if(0)) \
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0))
#endif
}

View File

@ -135,8 +135,7 @@ void PairBuckIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_lj = fc.special_lj;
const C_FORCE_T * _noalias const c_force = fc.c_force[0];
@ -167,7 +166,6 @@ void PairBuckIntel::eval(const int offload, const int vflag,
in(special_lj:length(0) alloc_if(0) free_if(0)) \
in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(ilist:length(0) alloc_if(0) free_if(0)) \
@ -188,8 +186,10 @@ void PairBuckIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -214,8 +214,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
const int ptr_off = itype * ntypes;
const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -324,13 +324,9 @@ void PairBuckIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -338,6 +334,8 @@ void PairBuckIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -396,19 +394,16 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc,
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -434,12 +429,10 @@ void PairBuckIntel::pack_force_const(ForceConst<flt_t> &fc,
flt_t * special_lj = fc.special_lj;
C_FORCE_T * c_force = fc.c_force[0];
C_ENERGY_T * c_energy = fc.c_energy[0];
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj: length(4) alloc_if(0) free_if(0)) \
in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0))
#endif
}

View File

@ -173,8 +173,7 @@ void PairDPDIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const FC_PACKED1_T * _noalias const param = fc.param[0];
const flt_t * _noalias const special_lj = fc.special_lj;
int * _noalias const rngi_thread = fc.rngi;
@ -204,8 +203,10 @@ void PairDPDIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -233,10 +234,10 @@ void PairDPDIntel::eval(const int offload, const int vflag,
flt_t icut, a0, gamma, sigma;
if (ONETYPE) {
icut = param[3].icut;
a0 = param[3].a0;
gamma = param[3].gamma;
sigma = param[3].sigma;
icut = param[_onetype].icut;
a0 = param[_onetype].a0;
gamma = param[_onetype].gamma;
sigma = param[_onetype].sigma;
}
for (int ii = iifrom; ii < iito; ii += iip) {
const int i = ilist[ii];
@ -248,8 +249,8 @@ void PairDPDIntel::eval(const int offload, const int vflag,
parami = param + ptr_off;
}
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp, fytmp, fztmp, fwtmp;
@ -379,13 +380,9 @@ void PairDPDIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -393,6 +390,8 @@ void PairDPDIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -503,32 +502,26 @@ void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
_onetype = 0;
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
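// count how many type pairs are explicitly set; if exactly one (and the
// system is not molecular), cache its flattened index for the ONETYPE kernel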
int mytypes = 0;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
} else {
cut = init_one(i,j);
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
mytypes++;
_onetype = i * tp1 + j;
}
double cut = init_one(i,j);
cutsq[i][j] = cutsq[j][i] = cut*cut;
double icut = 1.0 / cut;
fc.param[i][j].icut = fc.param[j][i].icut = icut;
}
}
if (mytypes > 1 || atom->molecular) _onetype = 0;
for (int i = 0; i < 4; i++) {
fc.special_lj[i] = force->special_lj[i];
fc.special_lj[0] = 1.0;

View File

@ -193,8 +193,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const FC_PACKED1_T * _noalias const rhor_spline_f = fc.rhor_spline_f;
const FC_PACKED1_T * _noalias const rhor_spline_e = fc.rhor_spline_e;
const FC_PACKED2_T * _noalias const z2r_spline_t = fc.z2r_spline_t;
@ -243,8 +242,10 @@ void PairEAMIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -293,8 +294,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
itype = x[i].w;
rhor_ioff = istride * itype;
}
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
const flt_t xtmp = x[i].x;
@ -450,8 +451,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
if (tid == 0)
comm->forward_comm_pair(this);
if (NEWTON_PAIR)
memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
#if defined(_OPENMP)
#pragma omp barrier
@ -469,8 +469,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
rhor_ioff = istride * itype;
scale_fi = scale_f + itype*ntypes;
}
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp, fytmp, fztmp, fwtmp;
@ -597,11 +597,7 @@ void PairEAMIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
@ -610,6 +606,8 @@ void PairEAMIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -682,19 +680,16 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,nr,nrho,memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -733,13 +728,13 @@ void PairEAMIntel::pack_force_const(ForceConst<flt_t> &fc,
fc.rhor_spline_e[joff + k].b=rhor_spline[type2rhor[j][i]][k][4];
fc.rhor_spline_e[joff + k].c=rhor_spline[type2rhor[j][i]][k][5];
fc.rhor_spline_e[joff + k].d=rhor_spline[type2rhor[j][i]][k][6];
fc.z2r_spline_t[joff + k].a=z2r_spline[type2rhor[j][i]][k][0];
fc.z2r_spline_t[joff + k].b=z2r_spline[type2rhor[j][i]][k][1];
fc.z2r_spline_t[joff + k].c=z2r_spline[type2rhor[j][i]][k][2];
fc.z2r_spline_t[joff + k].d=z2r_spline[type2rhor[j][i]][k][3];
fc.z2r_spline_t[joff + k].e=z2r_spline[type2rhor[j][i]][k][4];
fc.z2r_spline_t[joff + k].f=z2r_spline[type2rhor[j][i]][k][5];
fc.z2r_spline_t[joff + k].g=z2r_spline[type2rhor[j][i]][k][6];
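// z2r splines are indexed through type2z2r, not type2rhor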
fc.z2r_spline_t[joff + k].a=z2r_spline[type2z2r[j][i]][k][0];
fc.z2r_spline_t[joff + k].b=z2r_spline[type2z2r[j][i]][k][1];
fc.z2r_spline_t[joff + k].c=z2r_spline[type2z2r[j][i]][k][2];
fc.z2r_spline_t[joff + k].d=z2r_spline[type2z2r[j][i]][k][3];
fc.z2r_spline_t[joff + k].e=z2r_spline[type2z2r[j][i]][k][4];
fc.z2r_spline_t[joff + k].f=z2r_spline[type2z2r[j][i]][k][5];
fc.z2r_spline_t[joff + k].g=z2r_spline[type2z2r[j][i]][k][6];
}
}
}
@ -754,7 +749,7 @@ void PairEAMIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
const int nr, const int nrho,
Memory *memory,
const int cop) {
if (ntypes != _ntypes || nr > _nr || nrho > _nrho) {
if (ntypes != _ntypes || nr + 1 > _nr || nrho + 1 > _nrho) {
if (_ntypes > 0) {
_memory->destroy(rhor_spline_f);
_memory->destroy(rhor_spline_e);

View File

@ -228,8 +228,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_lj = fc.special_lj;
const FC_PACKED1_T * _noalias const ijc = fc.ijc[0];
@ -283,7 +282,6 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \
in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(quat:length(nall+1) alloc_if(0) free_if(0)) \
@ -337,9 +335,11 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
#endif
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0.0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
if (NEWTON_PAIR == 0) f_start[1].w = 0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -372,8 +372,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
const FC_PACKED1_T * _noalias const ijci = ijc + ptr_off;
const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
const flt_t xtmp = x[i].x;
@ -833,13 +833,9 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
f_start[1].w = ierror;
} // omp
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
ov0 *= (acc_t)-0.5;
ov1 *= (acc_t)-0.5;
ov2 *= (acc_t)-0.5;
@ -847,6 +843,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)-0.5;
ov5 *= (acc_t)-0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -914,19 +912,16 @@ void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
if (mthreads < buffers->get_off_threads())
mthreads = buffers->get_off_threads();
fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -963,14 +958,12 @@ void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
FC_PACKED1_T *oijc = fc.ijc[0];
FC_PACKED2_T *olj34 = fc.lj34[0];
FC_PACKED3_T *oic = fc.ic;
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
if (oijc != NULL && oic != NULL) {
#pragma offload_transfer target(mic:_cop) \
in(special_lj: length(4) alloc_if(0) free_if(0)) \
in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \
in(oic: length(tp1) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq))
in(oic: length(tp1) alloc_if(0) free_if(0))
}
#endif
}

View File

@ -138,8 +138,7 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
@ -186,7 +185,6 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(q:length(q_size) alloc_if(0) free_if(0)) \
@ -212,8 +210,10 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -248,8 +248,8 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
const flt_t * _noalias const cutsqi = cutsq + ptr_off;
const LJ_T * _noalias const lji = lj + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -403,16 +403,10 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -420,6 +414,8 @@ void PairLJCharmmCoulCharmmIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -487,22 +483,19 @@ void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
if (cut_lj > cut_coul)
error->all(FLERR,
"Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -540,12 +533,10 @@ void PairLJCharmmCoulCharmmIntel::pack_force_const(ForceConst<flt_t> &fc,
flt_t * special_coul = fc.special_coul;
flt_t * cutsq = fc.cutsq[0];
LJ_T * lj = fc.lj[0];
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0))
#endif
}

View File

@ -28,7 +28,7 @@
#include "suffix.h"
using namespace LAMMPS_NS;
#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec2_t
#define TABLE_T typename ForceConst<flt_t>::table_t
/* ---------------------------------------------------------------------- */
@ -142,8 +142,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
@ -207,7 +206,6 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(q:length(q_size) alloc_if(0) free_if(0)) \
@ -232,8 +230,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -268,8 +268,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
const flt_t * _noalias const cutsqi = cutsq + ptr_off;
const LJ_T * _noalias const lji = lj + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -376,8 +376,14 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
if (rsq < cut_ljsq) {
#endif
flt_t r6inv = r2inv * r2inv * r2inv;
forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);
if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);
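// rebuild the standard 12-6 coefficients from the packed pair
// (4*epsilon, sigma^6): lj4 = 4*eps*sig^6, lj3 = lj4*sig^6 = 4*eps*sig^12,
// lj2 = 6*lj4 and lj1 = 12*lj3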
flt_t eps4 = lji[jtype].x;
flt_t sigp6 = lji[jtype].y;
flt_t lj4 = eps4 * sigp6;
flt_t lj3 = lj4 * sigp6;
flt_t lj2 = (flt_t)6.0 * lj4;
flt_t lj1 = (flt_t)12.0 * lj3;
forcelj = r6inv * (lj1 * r6inv - lj2);
if (EFLAG) evdwl = r6inv*(lj3 * r6inv - lj4);
#ifdef INTEL_VMASK
if (rsq > cut_lj_innersq) {
@ -397,8 +403,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
}
#endif
} else {
const flt_t philj = r6inv * (lji[jtype].z*r6inv -
lji[jtype].w);
const flt_t philj = r6inv * (lj3 * r6inv - lj4);
#ifndef INTEL_VMASK
if (rsq > cut_lj_innersq)
#endif
@ -463,16 +468,10 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -480,6 +479,8 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -552,22 +553,19 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
fc.set_ntypes(tp1, ntable, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
if (cut_lj > cut_coul)
error->all(FLERR,
"Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -591,10 +589,13 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 1; i < tp1; i++) {
for (int j = 1; j < tp1; j++) {
fc.lj[i][j].x = lj1[i][j];
fc.lj[i][j].y = lj2[i][j];
fc.lj[i][j].z = lj3[i][j];
fc.lj[i][j].w = lj4[i][j];
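// pack only 4*epsilon and sigma^6 per pair (vec2_t); the kernel rebuilds
// lj1..lj4 on the fly, reading the i <= j triangle so mixed values stay symmetric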
if (i <= j) {
fc.lj[i][j].x = epsilon[i][j] * 4.0;
fc.lj[i][j].y = pow(sigma[i][j],6.0);
} else {
fc.lj[i][j].x = epsilon[j][i] * 4.0;
fc.lj[i][j].y = pow(sigma[j][i],6.0);
}
fc.cutsq[i][j] = cutsq[i][j];
}
}
@ -623,14 +624,12 @@ void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
flt_t * detable = fc.detable;
flt_t * ctable = fc.ctable;
flt_t * dctable = fc.dctable;
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
in(table: length(ntable) alloc_if(0) free_if(0)) \
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0))
#endif
}
@ -647,7 +646,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
flt_t * ospecial_lj = special_lj;
flt_t * ospecial_coul = special_coul;
flt_t * ocutsq = cutsq[0];
typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
typename IntelBuffers<flt_t,flt_t>::vec2_t * olj = lj[0];
table_t * otable = table;
flt_t * oetable = etable;
flt_t * odetable = detable;
@ -687,7 +686,7 @@ void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
flt_t * ospecial_lj = special_lj;
flt_t * ospecial_coul = special_coul;
flt_t * ocutsq = cutsq[0];
typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
typename IntelBuffers<flt_t,flt_t>::vec2_t * olj = lj[0];
table_t * otable = table;
flt_t * oetable = etable;
flt_t * odetable = detable;

View File

@ -69,7 +69,7 @@ class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
flt_t cut_lj_innersq;
table_t *table;
flt_t *etable, *detable, *ctable, *dctable;
typename IntelBuffers<flt_t,flt_t>::vec4_t **lj;
typename IntelBuffers<flt_t,flt_t>::vec2_t **lj;
ForceConst() : _ntypes(0), _ntable(0) {}
~ForceConst() { set_ntypes(0,0,NULL,_cop); }

View File

@ -141,8 +141,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_coul = fc.special_coul;
const flt_t * _noalias const special_lj = fc.special_lj;
@ -201,7 +200,6 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
in(firstneigh:length(0) alloc_if(0) free_if(0)) \
in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
in(numneigh:length(0) alloc_if(0) free_if(0)) \
in(x:length(x_size) alloc_if(0) free_if(0)) \
in(q:length(q_size) alloc_if(0) free_if(0)) \
@ -225,8 +223,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
f_stride, x, q);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = oecoul = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -260,8 +260,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
const C_FORCE_T * _noalias const c_forcei = c_force + ptr_off;
const C_ENERGY_T * _noalias const c_energyi = c_energy + ptr_off;
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp,fytmp,fztmp,fwtmp;
@ -433,16 +433,10 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
oecoul *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
}
if (vflag) {
if (NEWTON_PAIR == 0) {
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -450,6 +444,8 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = oecoul;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -522,19 +518,16 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
for (int i = 0; i < ncoultablebits; i++) ntable *= 2;
fc.set_ntypes(tp1, ntable, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -587,14 +580,12 @@ void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
flt_t * detable = fc.detable;
flt_t * ctable = fc.ctable;
flt_t * dctable = fc.dctable;
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
#pragma offload_transfer target(mic:_cop) \
in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
in(table: length(ntable) alloc_if(0) free_if(0)) \
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0))
#endif
}

View File

@ -150,8 +150,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int ** _noalias const firstneigh = (const int **)list->firstneigh;
const flt_t * _noalias const special_lj = fc.special_lj;
const FC_PACKED1_T * _noalias const ljc12o = fc.ljc12o[0];
const FC_PACKED2_T * _noalias const lj34 = fc.lj34[0];
@ -180,8 +179,10 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (NEWTON_PAIR == 0 && inum != nlocal)
memset(f_start, 0, f_stride * sizeof(FORCE_T));
// loop over neighbors of my atoms
#if defined(_OPENMP)
@ -201,12 +202,12 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
flt_t cutsq, lj1, lj2, lj3, lj4, offset;
if (ONETYPE) {
cutsq = ljc12o[3].cutsq;
lj1 = ljc12o[3].lj1;
lj2 = ljc12o[3].lj2;
lj3 = lj34[3].lj3;
lj4 = lj34[3].lj4;
offset = ljc12o[3].offset;
cutsq = ljc12o[_onetype].cutsq;
lj1 = ljc12o[_onetype].lj1;
lj2 = ljc12o[_onetype].lj2;
lj3 = lj34[_onetype].lj3;
lj4 = lj34[_onetype].lj4;
offset = ljc12o[_onetype].offset;
}
for (int ii = iifrom; ii < iito; ii += iip) {
const int i = ilist[ii];
@ -219,8 +220,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
ljc12oi = ljc12o + ptr_off;
lj34i = lj34 + ptr_off;
}
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
int jnum = numneigh[ii];
const int * _noalias const jlist = firstneigh[i];
int jnum = numneigh[i];
IP_PRE_neighbor_pad(jnum, offload);
acc_t fxtmp, fytmp, fztmp, fwtmp;
@ -338,13 +339,9 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
if (EFLAG || vflag) {
if (NEWTON_PAIR == 0) {
oevdwl *= (acc_t)0.5;
ov0 *= (acc_t)0.5;
ov1 *= (acc_t)0.5;
ov2 *= (acc_t)0.5;
@ -352,6 +349,8 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
ov4 *= (acc_t)0.5;
ov5 *= (acc_t)0.5;
}
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -414,25 +413,26 @@ void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t,acc_t> *buffers)
{
_onetype = 0;
if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
int mytypes = 0;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
mytypes++;
_onetype = i * tp1 + j;
} else {
cut = 0.0;
}
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
if (mytypes > 1 || atom->molecular) _onetype = 0;
for (int i = 0; i < 4; i++) {
fc.special_lj[i] = force->special_lj[i];

View File

@ -130,37 +130,37 @@ void PairSWIntel::compute(int eflag, int vflag,
if (_onetype) {
if (_spq) {
if (eflag) {
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (eflag) {
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
} else {
if (_spq) {
if (eflag) {
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
} else {
if (eflag) {
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
} else {
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
}
}
}
@ -173,7 +173,7 @@ template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
void PairSWIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc, const int astart,
const int aend, const int /*pad_width*/)
const int aend)
{
const int inum = aend - astart;
if (inum == 0) return;
@ -185,10 +185,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
ATOM_T * _noalias const x = buffers->get_x(offload);
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneighhalf = buffers->get_atombin();
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int * _noalias const firstneigh = list->firstneigh[ilist[0]];
int *nhalf, *cnum;
buffers->get_list_data3(list, nhalf, cnum);
const int * _noalias const numneighhalf = nhalf;
const int * _noalias const cnumneigh = cnum;
const FC_PACKED0_T * _noalias const p2 = fc.p2[0];
const FC_PACKED1_T * _noalias const p2f = fc.p2f[0];
@ -206,6 +209,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
const int ntypes = atom->ntypes + 1;
const int eatom = this->eflag_atom;
const int onetype = _onetype;
const int onetype3 = _onetype3;
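// flattened (i,j) and (i,j,k) indices into the pair (p2*) and triplet (p3)
// parameter tables, used by the ONETYPE-specialized code path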
// Determine how much data to transfer
int x_size, q_size, f_stride, ev_size, separate_flag;
@ -235,8 +240,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
in(overflow:length(0) alloc_if(0) free_if(0)) \
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload,onetype3) \
in(astart,nlocal,f_stride,minlocal,separate_flag,onetype) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
out(timer_compute:length(1) alloc_if(0) free_if(0)) \
@ -251,8 +256,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
#if defined(_OPENMP)
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
@ -278,24 +283,24 @@ void PairSWIntel::eval(const int offload, const int vflag,
flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3, c4, c5, c6;
flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
if (ONETYPE) {
cutsq = p2[3].cutsq;
cut = p2f[3].cut;
sigma = p2f[3].sigma;
c1 = p2f2[3].c1;
c2 = p2f2[3].c2;
c3 = p2f2[3].c3;
c4 = p2f2[3].c4;
sigma_gamma = p2[3].sigma_gamma;
costheta = p3[7].costheta;
lambda_epsilon = p3[7].lambda_epsilon;
lambda_epsilon2 = p3[7].lambda_epsilon2;
cutsq = p2[onetype].cutsq;
cut = p2f[onetype].cut;
sigma = p2f[onetype].sigma;
c1 = p2f2[onetype].c1;
c2 = p2f2[onetype].c2;
c3 = p2f2[onetype].c3;
c4 = p2f2[onetype].c4;
sigma_gamma = p2[onetype].sigma_gamma;
costheta = p3[onetype3].costheta;
lambda_epsilon = p3[onetype3].lambda_epsilon;
lambda_epsilon2 = p3[onetype3].lambda_epsilon2;
if (SPQ == 0) {
powerp = p2f[3].powerp;
powerq = p2f[3].powerq;
powerp = p2f[onetype].powerp;
powerq = p2f[onetype].powerq;
}
if (EFLAG) {
c5 = p2e[3].c5;
c6 = p2e[3].c6;
c5 = p2e[onetype].c5;
c6 = p2e[onetype].c6;
}
}
@ -312,7 +317,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
}
const int * _noalias const jlist = firstneigh + cnumneigh[ii];
const int jnum = numneigh[ii];
const int jnum = numneigh[i];
const int jnumhalf = numneighhalf[ii];
acc_t fxtmp, fytmp, fztmp, fwtmp;
@ -541,11 +546,9 @@ void PairSWIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -585,7 +588,7 @@ template <int SPQ,int ONETYPE,int EFLAG,class flt_t,class acc_t>
void PairSWIntel::eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> *buffers,
const ForceConst<flt_t> &fc, const int astart,
const int aend, const int pad_width)
const int aend)
{
typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t;
@ -601,10 +604,13 @@ void PairSWIntel::eval(const int offload, const int vflag,
ATOM_T * _noalias const x = buffers->get_x(offload);
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneighhalf = buffers->get_atombin();
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int * _noalias const firstneigh = list->firstneigh[ilist[0]];
int *nhalf, *cnum;
buffers->get_list_data3(list, nhalf, cnum);
const int * _noalias const numneighhalf = nhalf;
const int * _noalias const cnumneigh = cnum;
const FC_PACKED0_T * _noalias const p2 = fc.p2[0];
const FC_PACKED1_T * _noalias const p2f = fc.p2f[0];
@ -654,7 +660,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
in(ccachei,ccachej,ccachef:length(0) alloc_if(0) free_if(0)) \
in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
in(astart,nlocal,f_stride,minlocal,separate_flag) \
in(ccache_stride3) \
out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
@ -670,8 +676,8 @@ void PairSWIntel::eval(const int offload, const int vflag,
f_stride, x, 0);
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
#if defined(_OPENMP)
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
@ -701,34 +707,38 @@ void PairSWIntel::eval(const int offload, const int vflag,
SIMD_flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3, c4, c5, c6;
SIMD_flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
if (ONETYPE) {
cutsq = SIMD_set(p2[3].cutsq);
cut = SIMD_set(p2f[3].cut);
sigma = SIMD_set(p2f[3].sigma);
c1 = SIMD_set(p2f2[3].c1);
c2 = SIMD_set(p2f2[3].c2);
c3 = SIMD_set(p2f2[3].c3);
c4 = SIMD_set(p2f2[3].c4);
sigma_gamma = SIMD_set(p2[3].sigma_gamma);
costheta = SIMD_set(p3[7].costheta);
lambda_epsilon = SIMD_set(p3[7].lambda_epsilon);
lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2);
cutsq = SIMD_set(p2[_onetype].cutsq);
cut = SIMD_set(p2f[_onetype].cut);
sigma = SIMD_set(p2f[_onetype].sigma);
c1 = SIMD_set(p2f2[_onetype].c1);
c2 = SIMD_set(p2f2[_onetype].c2);
c3 = SIMD_set(p2f2[_onetype].c3);
c4 = SIMD_set(p2f2[_onetype].c4);
sigma_gamma = SIMD_set(p2[_onetype].sigma_gamma);
costheta = SIMD_set(p3[_onetype3].costheta);
lambda_epsilon = SIMD_set(p3[_onetype3].lambda_epsilon);
lambda_epsilon2 = SIMD_set(p3[_onetype3].lambda_epsilon2);
if (SPQ == 0) {
powerp = SIMD_set(p2f[3].powerp);
powerq = SIMD_set(p2f[3].powerq);
powerp = SIMD_set(p2f[_onetype].powerp);
powerq = SIMD_set(p2f[_onetype].powerq);
}
if (EFLAG) {
c5 = SIMD_set(p2e[3].c5);
c6 = SIMD_set(p2e[3].c6);
c5 = SIMD_set(p2e[_onetype].c5);
c6 = SIMD_set(p2e[_onetype].c6);
}
}
const SIMD_int goffset = SIMD_set(0,16,32,48,64,80,96,112,128,
144,160,176,192,208,224,240);
acc_t * const dforce = &(f[0].x);
for (int i = iifrom; i < iito; i += iip) {
SIMD_int ilistv = SIMD_load(ilist + i);
SIMD_int goffset = ilistv * 16;
SIMD_mask imask = ilistv < iito;
SIMD_mask imask;
if (swidth == 16)
imask = 0xFFFF;
else
imask = 0xFF;
const int rem = iito - i;
if (rem < swidth) imask = imask >> (swidth - rem);
SIMD_flt_t xtmp, ytmp, ztmp;
SIMD_int itype, itype_offset;
@ -741,11 +751,12 @@ void PairSWIntel::eval(const int offload, const int vflag,
#ifdef LMP_INTEL_3BODY_FAST
const int* ng = firstneigh + cnumneigh[i] - swidth;
const SIMD_int jnum = SIMD_loadz(imask, numneigh + i);
#else
SIMD_int ng = SIMD_load(cnumneigh + i);
const SIMD_int jnum = SIMD_gatherz(imask, numneigh, ilistv);
ng = ng - 1;
#endif
const SIMD_int jnum = SIMD_loadz(imask, numneigh + i);
const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i);
const int jnum_max = SIMD_max(jnum);
@ -1051,11 +1062,9 @@ void PairSWIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
ev_global[0] = oevdwl;
ev_global[1] = (acc_t)0.0;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -1162,26 +1171,22 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1,memory,_cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i,j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
_onetype = 0;
if (atom->ntypes == 1) _onetype = 1;
_spq = 1;
int mytypes = 0;
for (int ii = 0; ii < tp1; ii++) {
int i = map[ii];
for (int jj = 0; jj < tp1; jj++) {
@ -1231,6 +1236,9 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
fc.p3[ii][jj][kk].lambda_epsilon = 0;
fc.p3[ii][jj][kk].lambda_epsilon2 = 0;
} else {
mytypes++;
_onetype = ii * tp1 + jj;
_onetype3 = ii * tp1 * tp1 + jj * tp1 + kk;
int ijkparam = elem2param[i][j][k];
fc.p3[ii][jj][kk].costheta = params[ijkparam].costheta;
fc.p3[ii][jj][kk].lambda_epsilon = params[ijkparam].lambda_epsilon;
@ -1239,12 +1247,7 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
}
}
}
_host_pad = 1;
_offload_pad = 1;
if (INTEL_NBOR_PAD > 1)
_host_pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
if (mytypes > 1) _onetype = 0;
#ifdef _LMP_INTEL_OFFLOAD
if (_cop < 0) return;
@ -1253,16 +1256,11 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
FC_PACKED1p2_T *op2f2 = fc.p2f2[0];
FC_PACKED2_T *op2e = fc.p2e[0];
FC_PACKED3_T *op3 = fc.p3[0][0];
flt_t * ocutneighsq = cutneighsq[0];
int tp1sq = tp1 * tp1;
int tp1cu = tp1sq * tp1;
if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL &&
op3 != NULL && ocutneighsq != NULL) {
#pragma offload_transfer target(mic:_cop) \
in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0)) \
in(op3: length(tp1cu) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq))
}
#pragma offload_transfer target(mic:_cop) \
in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0)) \
in(op3: length(tp1cu) alloc_if(0) free_if(0))
#endif
}
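
The hard-coded offsets p2[3] and p3[7] removed above only hold for a system with a single atom type; the new _onetype and _onetype3 members record whichever single (i,j) and (i,j,k) parameter slots are actually set, and mytypes > 1 falls back to the generic path. A minimal standalone sketch of that index math (illustration only, not part of the commit), using the flattened tp1*tp1 and tp1*tp1*tp1 layout from pack_force_const:

#include <cstdio>

int main() {
  const int ntypes = 1;           // single-type system
  const int tp1 = ntypes + 1;     // type indices run 1..ntypes, slot 0 unused
  const int i = 1, j = 1, k = 1;  // the only parameterized combination

  // Row-major offsets into the flattened pair and triplet parameter tables.
  const int onetype  = i * tp1 + j;
  const int onetype3 = i * tp1 * tp1 + j * tp1 + k;

  // For ntypes == 1 these reproduce the old hard-coded values 3 and 7.
  std::printf("_onetype = %d  _onetype3 = %d\n", onetype, onetype3);
  return 0;
}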

View File

@ -49,13 +49,13 @@ class PairSWIntel : public PairSW {
template <int SPQ, int ONETYPE, int EFLAG, class flt_t, class acc_t>
void eval(const int offload, const int vflag,
IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
const int astart, const int aend, const int pad_width);
const int astart, const int aend);
template <class flt_t, class acc_t>
void pack_force_const(ForceConst<flt_t> &fc,
IntelBuffers<flt_t, acc_t> *buffers);
int _ccache_stride, _host_pad, _offload_pad, _spq, _onetype;
int _ccache_stride, _spq, _onetype, _onetype3;
#ifdef LMP_USE_AVXCD
int _ccache_stride3;
#endif
@ -75,7 +75,7 @@ class PairSWIntel : public PairSW {
flt_t c1, c2, c3, c4;
} fc_packed1p2;
typedef struct {
flt_t c5, c6;
flt_t c5, c6, d1, d2;
} fc_packed2;
typedef struct {
flt_t costheta, lambda_epsilon, lambda_epsilon2, pad;

View File

@ -220,7 +220,7 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
static void kernel_step(
int eatom, int vflag,
const int * _noalias const numneigh,
const int * _noalias const cnumneigh,
iarr vcnumneigh,
const int * _noalias const firstneigh,
int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
@ -241,7 +241,8 @@ struct IntelKernelTersoff : public lmp_intel::vector_routines<flt_t, acc_t, mic>
const c_inner_t * _noalias const c_inner,
const c_outer_t * _noalias const c_outer,
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl, int compress_idx, int i, iarr js, bvec vmask_repulsive
avec *vsevdwl, int compress_idx, int ii, int i, iarr js,
bvec vmask_repulsive
);
};
@ -274,9 +275,12 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
const int * _noalias const ilist = list->ilist;
const int * _noalias const numneigh = list->numneigh;
const int * _noalias const cnumneigh = buffers->cnumneigh(list);
const int * _noalias const numneighhalf = buffers->get_atombin();
const int * _noalias const firstneigh = buffers->firstneigh(list);
const int * _noalias const firstneigh = list->firstneigh[ilist[0]];
int *nhalf, *cnum;
buffers->get_list_data3(list, nhalf, cnum);
const int * _noalias const numneighhalf = nhalf;
const int * _noalias const cnumneigh = cnum;
typedef typename ForceConst<flt_t>::c_inner_t c_inner_t;
typedef typename ForceConst<flt_t>::c_outer_t c_outer_t;
@ -332,13 +336,13 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
f_stride, x, 0);
acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG) oevdwl = oecoul = (acc_t)0;
if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
if (EFLAG || vflag)
oevdwl = ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
// loop over neighbors of my atoms
#if defined(_OPENMP)
#pragma omp parallel reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
#pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
#endif
{
int iifrom, iito, tid;
@ -379,11 +383,9 @@ void PairTersoffIntel::eval(const int offload, const int vflag,
IP_PRE_fdotr_reduce(1, nall, nthreads, f_stride, vflag,
ov0, ov1, ov2, ov3, ov4, ov5);
if (EFLAG) {
if (EFLAG || vflag) {
ev_global[0] = oevdwl;
ev_global[1] = 0.0;
}
if (vflag) {
ev_global[2] = ov0;
ev_global[3] = ov1;
ev_global[4] = ov2;
@ -461,19 +463,16 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
int tp1 = atom->ntypes + 1;
fc.set_ntypes(tp1, memory, _cop);
buffers->set_ntypes(tp1);
flt_t **cutneighsq = buffers->get_cutneighsq();
// Repeat cutsq calculation because done after call to init_style
double cut, cutneigh;
for (int i = 1; i <= atom->ntypes; i++) {
for (int j = i; j <= atom->ntypes; j++) {
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
double cut;
if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0))
cut = init_one(i, j);
cutneigh = cut + neighbor->skin;
cutsq[i][j] = cutsq[j][i] = cut*cut;
cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
}
else
cut = 0.0;
cutsq[i][j] = cutsq[j][i] = cut*cut;
}
}
@ -547,7 +546,6 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
typename ForceConst<flt_t>::c_inner_loop_t * c_inner_loop = fc.c_inner_loop[0][0];
typename ForceConst<flt_t>::c_cutoff_t * c_cutoff_inner = fc.c_cutoff_inner[0][0];
typename ForceConst<flt_t>::c_inner_t * c_inner = fc.c_inner[0][0];
flt_t * ocutneighsq = cutneighsq[0];
size_t VL = 512 / 8 / sizeof(flt_t);
int ntypes = tp1;
int ntypes_pad = ntypes + VL - ntypes % VL;
@ -557,8 +555,7 @@ void PairTersoffIntel::pack_force_const(ForceConst<flt_t> &fc,
#pragma offload_transfer target(mic:_cop) \
in(c_first_loop, c_second_loop, c_cutoff_outer, c_outer : length(tp1sq) alloc_if(0) free_if(0)) \
in(c_inner : length(tp1cb) alloc_if(0) free_if(0)) \
in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0)) \
in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
in(c_cutoff_inner : length(tp1cb_pad) alloc_if(0) free_if(0))
#endif
}
@ -647,7 +644,7 @@ template<class flt_t, class acc_t, lmp_intel::CalculationMode mic, bool pack_i>
template<int EFLAG>
void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
int eatom, int vflag,
const int * _noalias const numneigh, const int * _noalias const cnumneigh,
const int * _noalias const numneigh, iarr vcnumneigh,
const int * _noalias const firstneigh, int ntypes,
typename IntelBuffers<flt_t,acc_t>::atom_t * _noalias const x,
const typename PairTersoffIntel::ForceConst<flt_t>::c_inner_t * _noalias const c_inner,
@ -681,6 +678,7 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
// TODO: We could extract this from the driver routine
ivec vis = v::int_mullo(v_i4floats, v::int_load_vl(is));
ivec vjs = v::int_mullo(v_i4floats, v::int_load_vl(js));
ivec vcnumneigh_i = v::int_load_vl(vcnumneigh);
bvec vmask = v::mask_enable_lower(compress_idx);
fvec vx_i = v::zero(), vy_i = v::zero(), vz_i = v::zero();
ivec vw_i = v_i0;
@ -692,7 +690,6 @@ void IntelKernelTersoff<flt_t, acc_t, mic, pack_i>::kernel_step(
fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
fvec vrij = sqrt(vrijsq);
ivec vis_orig = v::int_load_vl(is);
ivec vcnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, cnumneigh);
ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh);
ivec vc_idx_ij = v::int_mullo(v_i4floats, vw_j + v::int_mullo(v_i_ntypes, vw_i));
@ -930,6 +927,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
typename IntelBuffers<flt_t,acc_t>::vec3_acc_t * _noalias const f,
avec *vsevdwl,
int compress_idx,
int ii,
int i,
iarr js,
bvec vmask_repulsive
@ -970,7 +968,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel_step_const_i(
fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij;
fvec vrij = sqrt(vrijsq);
int cnumneigh_i = cnumneigh[i];
int cnumneigh_i = cnumneigh[ii];
int numneigh_i = numneigh[i];
ivec vc_idx_j = v::int_mullo(v_i4floats, vw_j);
ivec vc_idx_j_ntypes = v::int_mullo(v_i_ntypes, vc_idx_j);
@ -1147,7 +1145,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
) {
int compress_idx = 0;
int ii, jj;
iarr is, js;
iarr is, js, vc;
avec vsevdwl = v::acc_zero();
ivec v_i4floats(static_cast<int>(sizeof(typename v::fscal) * 4));
ivec vj, v_NEIGHMASK(NEIGHMASK);
@ -1166,7 +1164,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
flt_t y_i = x[i].y;
flt_t z_i = x[i].z;
int jlist_off_i = cnumneigh[ii];
int jnum = numneigh[ii];
int jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
int j = firstneigh[jlist_off_i + jj] & NEIGHMASK;
int w_j = x[j].w;
@ -1178,6 +1176,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
if (rsq < cutsq) {
is[compress_idx] = ii;
js[compress_idx] = j;
vc[compress_idx] = jlist_off_i;
if (jj < numneighhalf[ii])
repulsive_flag[compress_idx] = 1;
compress_idx += 1;
@ -1187,7 +1186,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
kernel_step<EFLAG>(
eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes,
numneigh, vc, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, compress_idx,
is, js, vmask_repulsive
@ -1203,7 +1202,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
numneigh, cnumneigh, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, compress_idx,
i, js, vmask_repulsive
ii, i, js, vmask_repulsive
);
compress_idx = 0;
v::int_clear_arr(repulsive_flag);
@ -1215,7 +1214,7 @@ void IntelKernelTersoff<flt_t,acc_t,mic, pack_i>::kernel(
vmask_repulsive = v::int_cmpneq(v::int_load_vl(repulsive_flag), ivec(0));
IntelKernelTersoff::kernel_step<EFLAG>(
eatom, vflag,
numneigh, cnumneigh, firstneigh, ntypes,
numneigh, vc, firstneigh, ntypes,
x, c_inner, c_outer, f,
&vsevdwl, compress_idx,
is, js, vmask_repulsive
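
The ii/i substitutions in this file are easy to misread: with numneigh now taken from list->numneigh and the packed offsets from get_list_data3, neighbor counts are indexed by the atom id i = ilist[ii], while the offsets stay indexed by the list position ii (hence the extra ii argument to kernel_step_const_i and the per-lane vc offsets handed to kernel_step). A small sketch of that convention, with hypothetical names (NeighborView, visit_neighbors) invented for illustration:

#include <vector>

struct NeighborView {
  const int *ilist;       // list position -> atom id
  const int *numneigh;    // per atom id: neighbor count
  const int *cnumneigh;   // per list position: offset into firstneigh
  const int *firstneigh;  // packed neighbor ids
};

// Collect the neighbors of the atom occupying list slot ii.
inline void visit_neighbors(const NeighborView &nl, int ii,
                            std::vector<int> &out) {
  const int i = nl.ilist[ii];         // atom id for this slot
  const int jnum = nl.numneigh[i];    // count indexed by atom id
  const int joff = nl.cnumneigh[ii];  // offset indexed by list position
  for (int jj = 0; jj < jnum; jj++)
    out.push_back(nl.firstneigh[joff + jj]);
}

int main() {
  const int ilist[] = {2, 0};          // two local atoms, ids 2 and 0
  const int numneigh[] = {1, 0, 2};    // indexed by atom id
  const int cnumneigh[] = {0, 2};      // indexed by list position
  const int firstneigh[] = {0, 1, 2};  // neighbors of id 2, then of id 0
  NeighborView nl{ilist, numneigh, cnumneigh, firstneigh};
  std::vector<int> out;
  visit_neighbors(nl, 0, out);         // neighbors of atom id 2 -> {0, 1}
  return static_cast<int>(out.size());
}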

View File

@ -581,6 +581,12 @@ void PPPMIntel::fieldforce_ik(IntelBuffers<flt_t,acc_t> *buffers)
else
nthr = comm->nthreads;
if (fix->need_zero(0)) {
int zl = nlocal;
if (force->newton_pair) zl += atom->nghost;
memset(f, 0, zl * sizeof(FORCE_T));
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(nlocal, nthr) if(!_use_lrt)
@ -726,6 +732,12 @@ void PPPMIntel::fieldforce_ad(IntelBuffers<flt_t,acc_t> *buffers)
FFT_SCALAR * _noalias const particle_eky = this->particle_eky;
FFT_SCALAR * _noalias const particle_ekz = this->particle_ekz;
if (fix->need_zero(0)) {
int zl = nlocal;
if (force->newton_pair) zl += atom->nghost;
memset(f, 0, zl * sizeof(FORCE_T));
}
#if defined(_OPENMP)
#pragma omp parallel default(none) \
shared(nlocal, nthr) if(!_use_lrt)

View File

@ -79,7 +79,9 @@ void *Memory::srealloc(void *ptr, bigint nbytes, const char *name)
#if defined(LMP_USE_TBB_ALLOCATOR)
ptr = scalable_aligned_realloc(ptr, nbytes, LAMMPS_MEMALIGN);
#elif defined(LMP_INTEL_NO_TBB) && defined(LAMMPS_MEMALIGN)
#elif defined(LMP_INTEL_NO_TBB) && defined(LAMMPS_MEMALIGN) && \
defined(__INTEL_COMPILER)
ptr = realloc(ptr, nbytes);
uintptr_t offset = ((uintptr_t)(const void *)(ptr)) % LAMMPS_MEMALIGN;
if (offset) {

View File

@ -207,6 +207,10 @@ class Pair : protected Pointers {
typedef union {int i; float f;} union_int_float_t;
// Accessor for the user-intel package to determine virial calc for hybrid
inline int fdotr_is_set() const { return vflag_fdotr; }
protected:
int vflag_fdotr;
int maxeatom,maxvatom;
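
The accessor added at the end of pair.h gives code outside the Pair class (the USER-INTEL fix and styles, per the comment) a way to check whether a style accumulates its virial through the fdotr path without reaching into the member directly. A hedged sketch of a caller (hypothetical helper, only meaningful when compiled inside the LAMMPS source tree):

#include "pair.h"   // LAMMPS_NS::Pair with the new fdotr_is_set() accessor

// Illustration only: query a (possibly hybrid sub-)style for fdotr virial use.
static bool virial_via_fdotr(const LAMMPS_NS::Pair *style) {
  return style->fdotr_is_set() != 0;
}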