diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh
index 2193be0097..e07261f172 100644
--- a/src/USER-INTEL/Install.sh
+++ b/src/USER-INTEL/Install.sh
@@ -26,14 +26,12 @@ action () {
 # do not install child files if parent does not exist
 
 for file in *_intel.cpp; do
-  test $file = thr_intel.cpp && continue
   dep=`echo $file | sed 's/neigh_full_intel/neigh_full/g' | \
       sed 's/_offload_intel//g' | sed 's/_intel//g'`
   action $file $dep
 done
 
 for file in *_intel.h; do
-  test $file = thr_intel.h && continue
   dep=`echo $file | sed 's/_offload_intel//g' | sed 's/_intel//g'`
   action $file $dep
 done
@@ -42,6 +40,8 @@ action intel_preprocess.h
 action intel_buffers.h
 action intel_buffers.cpp
 action math_extra_intel.h
+action intel_simd.h pair_sw_intel.cpp
+action intel_intrinsics.h pair_tersoff_intel.cpp
 
 # step 2: handle cases and tasks not handled in step 1.
 
diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README
index 3465101074..0fe0813531 100644
--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@@ -3,9 +3,11 @@
                           LAMMPS Intel(R) Package
                      --------------------------------
                      
-                        W. Michael Brown (Intel)
-                       michael.w.brown at intel.com
-
+             W. Michael Brown (Intel) michael.w.brown at intel.com
+                   Rodrigo Canales (RWTH Aachen University)
+                  Markus H�hnerbach (RWTH Aachen University)
+		   Ahmed E. Ismail (RWTH Aachen University)
+                   Paolo Bientinesi (RWTH Aachen University)
                           Anupama Kurpad (Intel)
                           Biswajit Mishra (Shell)
 
@@ -53,3 +55,12 @@ By default, when running with offload to Intel(R) coprocessors, affinity
 for host MPI tasks and OpenMP threads is set automatically within the code.
 This currently requires the use of system calls. To disable at build time,
 compile with -DINTEL_OFFLOAD_NOAFFINITY.
+
+-----------------------------------------------------------------------------
+
+Vector intrinsics are temporarily being used for the Stillinger-Weber 
+potential to allow for advanced features in the AVX512 instruction set to
+be exploited on early hardware. We hope to see compiler improvements for
+AVX512 that will eliminate this requirement, so it is not recommended to
+develop code based on the intrinsics implementation. Please e-mail the 
+authors for more details.
diff --git a/src/USER-INTEL/TEST/in.intel.lc b/src/USER-INTEL/TEST/in.intel.lc
index 646065f6f9..feabfdc2e6 100644
--- a/src/USER-INTEL/TEST/in.intel.lc
+++ b/src/USER-INTEL/TEST/in.intel.lc
@@ -7,7 +7,7 @@
 package intel 1 mode mixed balance $b
 package omp 0
 suffix $s
-processors * * * grid numa
+# processors * * * grid numa
 
 variable        x index 4
 variable        y index 2
diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp
index 602c20e5db..fac76939dd 100644
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -60,6 +60,8 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
 
   int ncops = force->inumeric(FLERR,arg[3]);
 
+  _nbor_pack_width = 1;
+
   _precision_mode = PREC_MODE_MIXED;
   _offload_balance = 1.0;
   _overflow_flag[LMP_OVERFLOW] = 0;
@@ -307,12 +309,14 @@ void FixIntel::setup(int vflag)
 
 /* ---------------------------------------------------------------------- */
 
-void FixIntel::pair_init_check()
+void FixIntel::pair_init_check(const bool cdmessage)
 {
   #ifdef INTEL_VMASK
   atom->sortfreq = 1;
   #endif
 
+  _nbor_pack_width = 1;
+
   #ifdef _LMP_INTEL_OFFLOAD
   if (_offload_balance != 0.0) atom->sortfreq = 1;
 
@@ -371,15 +375,12 @@ void FixIntel::pair_init_check()
   char kmode[80];
   if (_precision_mode == PREC_MODE_SINGLE) {
     strcpy(kmode, "single");
-    get_single_buffers()->free_all_nbor_buffers();
     get_single_buffers()->need_tag(need_tag);
   } else if (_precision_mode == PREC_MODE_MIXED) {
     strcpy(kmode, "mixed");
-    get_mixed_buffers()->free_all_nbor_buffers();
     get_mixed_buffers()->need_tag(need_tag);
   } else {
     strcpy(kmode, "double");
-    get_double_buffers()->free_all_nbor_buffers();
     get_double_buffers()->need_tag(need_tag);
   }
 
@@ -399,6 +400,13 @@ void FixIntel::pair_init_check()
 	fprintf(screen,"Using Intel Package without Coprocessor.\n");
       }
       fprintf(screen,"Precision: %s\n",kmode);
+      if (cdmessage) {
+	#ifdef LMP_USE_AVXCD
+	fprintf(screen,"AVX512 CD Optimizations: Enabled\n");
+	#else
+	fprintf(screen,"AVX512 CD Optimizations: Disabled\n");
+	#endif
+      }
       fprintf(screen,
 	      "----------------------------------------------------------\n");
     }
diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h
index 18cd76be1d..c1e5dd1061 100644
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@@ -11,6 +11,10 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
 #ifdef FIX_CLASS
 
 FixStyle(INTEL,FixIntel)
@@ -39,7 +43,7 @@ class FixIntel : public Fix {
   virtual int setmask();
   virtual void init();
   virtual void setup(int);
-  void pair_init_check();
+  void pair_init_check(const bool cdmessage=false);
 
   // Get all forces, calculation results from coprocesser
   void sync_coprocessor();
@@ -58,12 +62,15 @@ class FixIntel : public Fix {
   inline IntelBuffers<double,double> * get_double_buffers()
     { return _double_buffers; }
 
+  inline int nbor_pack_width() const { return _nbor_pack_width; }
+  inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
+
  protected:
   IntelBuffers<float,float> *_single_buffers;
   IntelBuffers<float,double> *_mixed_buffers;
   IntelBuffers<double,double> *_double_buffers;
 
-  int _precision_mode, _nthreads;
+  int _precision_mode, _nthreads, _nbor_pack_width;
 
  public:
   inline int* get_overflow_flag() { return _overflow_flag; }
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
index 154db19258..1cae48cfb3 100644
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -343,11 +343,15 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list()
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
                                                  const int nlocal,
-                                                 const int offload_end)
+	                                         const int nthreads,
+	                                         const int offload_end,
+	                                         const int pack_width)
 {
   free_nbor_list();
   _list_alloc_atoms = 1.10 * nlocal;
-  int list_alloc_size = (_list_alloc_atoms + _off_threads) * get_max_nbors();
+  int nt = MAX(nthreads, _off_threads);
+  int list_alloc_size = (_list_alloc_atoms + nt + pack_width - 1) * 
+    get_max_nbors();
   lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
   #ifdef _LMP_INTEL_OFFLOAD
   if (offload_end > 0) {
@@ -393,6 +397,9 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
     flt_t *ccachew = _ccachew;
     int *ccachei = _ccachei;
     int *ccachej = _ccachej;
+    #ifdef LMP_USE_AVXCD
+    acc_t *ccachef = _ccachef;
+    #endif
 
     #ifdef _LMP_INTEL_OFFLOAD
     if (_off_ccache) {
@@ -409,6 +416,9 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
     lmp->memory->destroy(ccachew);
     lmp->memory->destroy(ccachei);
     lmp->memory->destroy(ccachej);
+    #ifdef LMP_USE_AVXCD
+    lmp->memory->destroy(ccachef);
+    #endif
 
     _ccachex = 0;
   }
@@ -418,7 +428,8 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
 
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
-                                           const int nthreads)
+	const int nthreads,
+	const int width)
 {
   #ifdef _LMP_INTEL_OFFLOAD
   if (_ccachex && off_flag && _off_ccache == 0)
@@ -427,7 +438,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
   if (_ccachex)
     return;
 
-  const int nsize = get_max_nbors();
+  const int nsize = get_max_nbors() * width;
   int esize = MIN(sizeof(int), sizeof(flt_t));
   IP_PRE_get_stride(_ccache_stride, nsize, esize, 0);
   int nt = MAX(nthreads, _off_threads);
@@ -439,6 +450,11 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
   lmp->memory->create(_ccachew, vsize, "_ccachew");
   lmp->memory->create(_ccachei, vsize, "_ccachei");
   lmp->memory->create(_ccachej, vsize, "_ccachej");
+  #ifdef LMP_USE_AVXCD
+  IP_PRE_get_stride(_ccache_stride3, nsize * 3, esize, 0);
+  lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
+  #endif
+  memset(_ccachej, 0, vsize * sizeof(int));
 
   #ifdef _LMP_INTEL_OFFLOAD
   if (off_flag) {
@@ -454,7 +470,8 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
         nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
-        nocopy(ccachei,ccachej:length(vsize) alloc_if(1) free_if(0))
+        nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
+        in(ccachej:length(vsize) alloc_if(1) free_if(0))
     }
     _off_ccache = 1;
   }
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
index afd7fe46ed..cbae3f56d5 100644
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@@ -75,14 +75,14 @@ class IntelBuffers {
     free_local();
   }
 
-  inline void grow_nbor(NeighList *list, const int nlocal,
-                        const int offload_end) {
+  inline void grow_nbor(NeighList *list, const int nlocal, const int nthreads,
+                        const int offload_end, const int pack_width=1) {
     grow_local(list, offload_end);
     if (offload_end) {
       grow_nmax();
       grow_binhead();
     }
-    grow_nbor_list(list, nlocal, offload_end);
+    grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
   }
 
   void free_nmax();
@@ -111,7 +111,7 @@ class IntelBuffers {
   }
 
   void free_ccache();
-  void grow_ccache(const int off_flag, const int nthreads);
+  void grow_ccache(const int off_flag, const int nthreads, const int width=1);
   inline int ccache_stride() { return _ccache_stride; }
   inline flt_t * get_ccachex() { return _ccachex; }
   inline flt_t * get_ccachey() { return _ccachey; }
@@ -119,6 +119,10 @@ class IntelBuffers {
   inline flt_t * get_ccachew() { return _ccachew; }
   inline int * get_ccachei() { return _ccachei; }
   inline int * get_ccachej() { return _ccachej; }
+  #ifdef LMP_USE_AVXCD
+  inline int ccache_stride3() { return _ccache_stride3; }
+  inline acc_t * get_ccachef() { return _ccachef; }
+  #endif
 
   inline int get_max_nbors() {
     int mn = lmp->neighbor->oneatom * sizeof(int) /
@@ -129,9 +133,10 @@ class IntelBuffers {
   void free_nbor_list();
 
   inline void grow_nbor_list(NeighList *list, const int nlocal,
-                             const int offload_end) {
+                             const int nthreads, const int offload_end,
+			     const int pack_width) {
     if (nlocal > _list_alloc_atoms)
-      _grow_nbor_list(list, nlocal, offload_end);
+      _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
     #ifdef _LMP_INTEL_OFFLOAD
     else if (offload_end > 0 && _off_map_stencil != list->stencil)
       _grow_stencil(list);
@@ -281,6 +286,10 @@ class IntelBuffers {
   int _ccache_stride;
   flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew;
   int *_ccachei, *_ccachej;
+  #ifdef LMP_USE_AVXCD
+  int _ccache_stride3;
+  acc_t * _ccachef;
+  #endif
 
   #ifdef _LMP_INTEL_OFFLOAD
   int _separate_buffers;
@@ -305,8 +314,8 @@ class IntelBuffers {
   void _grow_nmax();
   void _grow_local(NeighList *list, const int offload_end);
   void _grow_binhead();
-  void _grow_nbor_list(NeighList *list, const int nlocal,
-                       const int offload_end);
+  void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
+                       const int offload_end, const int pack_width);
   void _grow_stencil(NeighList *list);
 };
 
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
index 39809cf882..aabe6e32ad 100644
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -55,30 +55,37 @@ enum {LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX, LMP_GHOST_MIN,
 enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
       TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
       TIME_IMBALANCE};
-#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
 
+#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
 #define INTEL_MIC_VECTOR_WIDTH 16
 #define INTEL_VECTOR_WIDTH 4
+
 #ifdef __AVX__
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 8
 #endif
+
 #ifdef __AVX2__
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 8
 #endif
+
 #ifdef __AVX512F__
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 16
 #define INTEL_V512 1
 #define INTEL_VMASK 1
 #else
-
 #ifdef __MIC__
 #define INTEL_V512 1
 #define INTEL_VMASK 1
 #endif
+#endif
 
+#ifdef __AVX512CD__
+#ifndef _LMP_INTEL_OFFLOAD
+#define LMP_USE_AVXCD
+#endif
 #endif
 
 #define INTEL_DATA_ALIGN 64
@@ -134,6 +141,18 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 			   datasize);				\
   }
 
+#define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum,          \
+				nthreads, vecsize)		\
+  {								\
+    tid = omp_get_thread_num();         			\
+    int idelta = static_cast<int>(ceil(static_cast<float>(inum)	\
+				       /vecsize/nthreads));	\
+    idelta *= vecsize;						\
+    ifrom = tid*idelta;						\
+    ito = ifrom + idelta;					\
+    if (ito > inum) ito = inum;					\
+  }
+
 #else
 
 #define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)	\
@@ -364,6 +383,43 @@ inline double MIC_Wtime() {
   }                                                                     \
 }
 
+#define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\
+			      f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,	\
+			      f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,	\
+			      vb3x, vb3y, vb3z,oedihedral, force,	\
+			      newton, nlocal)				\
+{                                                                       \
+  flt_t ev_pre;								\
+  if (newton) ev_pre = (flt_t)1.0;					\
+  else {								\
+    ev_pre = (flt_t)0.0;						\
+    if (i1 < nlocal) ev_pre += (flt_t)0.25;				\
+    if (i2 < nlocal) ev_pre += (flt_t)0.25;				\
+    if (i3 < nlocal) ev_pre += (flt_t)0.25;				\
+    if (i4 < nlocal) ev_pre += (flt_t)0.25;				\
+  }									\
+									\
+  if (eflag) {								\
+    oedihedral += ev_pre * deng;					\
+    if (eatom) {							\
+      flt_t qdeng = deng * (flt_t)0.25;					\
+      if (newton || i1 < nlocal) f[i1].w += qdeng;			\
+      if (newton || i2 < nlocal) f[i2].w += qdeng;			\
+      if (newton || i3 < nlocal) f[i3].w += qdeng;			\
+      if (newton || i4 < nlocal) f[i4].w += qdeng;			\
+    }									\
+  }									\
+									\
+  if (vflag) {								\
+    sv0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x);		\
+    sv1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y);		\
+    sv2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z);		\
+    sv3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y);		\
+    sv4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z);		\
+    sv5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z);		\
+  }                                                                     \
+}
+
 #define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp)    	\
 {									\
   if (evflag) {								\
diff --git a/src/USER-INTEL/math_extra_intel.h b/src/USER-INTEL/math_extra_intel.h
index 6d209f57fc..403b74d8fe 100644
--- a/src/USER-INTEL/math_extra_intel.h
+++ b/src/USER-INTEL/math_extra_intel.h
@@ -351,4 +351,128 @@
   ans##_0 = (aug_3 - t) / aug_0;			\
 }
 
+/* ----------------------------------------------------------------------
+   normalize a quaternion
+------------------------------------------------------------------------- */
+
+#define ME_qnormalize(q)						\
+{									\
+  double norm = 1.0 /							\
+    sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k);	\
+  q##_w *= norm;							\
+  q##_i *= norm;							\
+  q##_j *= norm;							\
+  q##_k *= norm;							\
+}
+
+/* ----------------------------------------------------------------------
+   compute omega from angular momentum
+   w = omega = angular velocity in space frame
+   wbody = angular velocity in body frame
+   project space-frame angular momentum onto body axes
+     and divide by principal moments
+------------------------------------------------------------------------- */
+
+#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w)	\
+{									\
+  double wbody_0, wbody_1, wbody_2;					\
+  double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8;	\
+									\
+  double w2 = quat##_w * quat##_w;					\
+  double i2 = quat##_i * quat##_i;					\
+  double j2 = quat##_j * quat##_j;					\
+  double k2 = quat##_k * quat##_k;					\
+  double twoij = 2.0 * quat##_i * quat##_j;				\
+  double twoik = 2.0 * quat##_i * quat##_k;				\
+  double twojk = 2.0 * quat##_j * quat##_k;				\
+  double twoiw = 2.0 * quat##_i * quat##_w;				\
+  double twojw = 2.0 * quat##_j * quat##_w;				\
+  double twokw = 2.0 * quat##_k * quat##_w;				\
+    									\
+  rot##_0 = w2 + i2 - j2 - k2;					        \
+  rot##_1 = twoij - twokw;						\
+  rot##_2 = twojw + twoik;						\
+	  								\
+  rot##_3 = twoij + twokw;					        \
+  rot##_4 = w2 - i2 + j2 - k2;				                \
+  rot##_5 = twojk - twoiw;					        \
+									\
+  rot##_6 = twoik - twojw;				                \
+  rot##_7 = twojk + twoiw;				                \
+  rot##_8 = w2 - i2 - j2 + k2;			                        \
+									\
+  wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2;              \
+  wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2;              \
+  wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2;              \
+									\
+  wbody_0 *= moments_0;							\
+  wbody_1 *= moments_1;							\
+  wbody_2 *= moments_2;							\
+									\
+  w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2;          \
+  w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2;          \
+  w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2;          \
+}
+
+#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2)	\
+{									\
+  angmomin[0] += dtf * torque[0];					\
+  double angmom_0 = angmomin[0];					\
+  angmomin[1] += dtf * torque[1];					\
+  double angmom_1 = angmomin[1];					\
+  angmomin[2] += dtf * torque[2];					\
+  double angmom_2 = angmomin[2];					\
+									\
+  double quat_w = quatin[0];						\
+  double quat_i = quatin[1];						\
+  double quat_j = quatin[2];						\
+  double quat_k = quatin[3];						\
+									\
+  double omega_0, omega_1, omega_2;					\
+  ME_mq_to_omega(angmom,quat,i0,i1,i2,omega);				\
+									\
+  double wq_0, wq_1, wq_2, wq_3;					\
+  wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k;		\
+  wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j;		\
+  wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k;		\
+  wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i;		\
+									\
+  double qfull_w, qfull_i, qfull_j, qfull_k;				\
+  qfull_w = quat_w + dtq * wq_0;					\
+  qfull_i = quat_i + dtq * wq_1;					\
+  qfull_j = quat_j + dtq * wq_2;					\
+  qfull_k = quat_k + dtq * wq_3;					\
+  ME_qnormalize(qfull);							\
+									\
+  double qhalf_w, qhalf_i, qhalf_j, qhalf_k;				\
+  qhalf_w = quat_w + 0.5*dtq * wq_0;					\
+  qhalf_i = quat_i + 0.5*dtq * wq_1;					\
+  qhalf_j = quat_j + 0.5*dtq * wq_2;					\
+  qhalf_k = quat_k + 0.5*dtq * wq_3;					\
+  ME_qnormalize(qhalf);							\
+  									\
+  ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega);				\
+  wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k;		\
+  wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j;		\
+  wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k;		\
+  wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i;		\
+									\
+  qhalf_w += 0.5*dtq * wq_0;						\
+  qhalf_i += 0.5*dtq * wq_1;						\
+  qhalf_j += 0.5*dtq * wq_2;						\
+  qhalf_k += 0.5*dtq * wq_3;						\
+  ME_qnormalize(qhalf);							\
+									\
+  quat_w = 2.0*qhalf_w - qfull_w;					\
+  quat_i = 2.0*qhalf_i - qfull_i;					\
+  quat_j = 2.0*qhalf_j - qfull_j;					\
+  quat_k = 2.0*qhalf_k - qfull_k;					\
+  ME_qnormalize(quat);							\
+									\
+  quatin[0] = quat_w;							\
+  quatin[1] = quat_i;							\
+  quatin[2] = quat_j;							\
+  quatin[3] = quat_k;							\
+}
+
 #endif
diff --git a/src/USER-INTEL/neigh_half_bin_intel.cpp b/src/USER-INTEL/neigh_half_bin_intel.cpp
index ca3f12135d..6700712cb4 100644
--- a/src/USER-INTEL/neigh_half_bin_intel.cpp
+++ b/src/USER-INTEL/neigh_half_bin_intel.cpp
@@ -15,6 +15,8 @@
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
+//#define OUTER_CHUNK 1
+
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "atom.h"
@@ -26,6 +28,14 @@
 #include <omp.h>
 #endif
 
+#ifdef LMP_USE_AVXCD
+#include "intel_simd.h"
+#endif
+
+#ifdef OUTER_CHUNK
+#include "intel_simd.h"
+#endif
+
 using namespace LAMMPS_NS;
 
 #ifdef _LMP_INTEL_OFFLOAD
@@ -42,17 +52,11 @@ using namespace LAMMPS_NS;
   for (int s = 0; s < n3; s++) {                                      \
     if (sptr[s] == tag) {                                             \
       if (s < n1) {                                                   \
-        if (special_flag[1] == 0) which = -1;                         \
-        else if (special_flag[1] == 1) which = 0;                     \
-        else which = 1;                                               \
+        which = 1;						      \
       } else if (s < n2) {                                            \
-        if (special_flag[2] == 0) which = -1;                         \
-        else if (special_flag[2] == 1) which = 0;                     \
-        else which = 2;                                               \
+        which = 2;						      \
       } else {                                                        \
-        if (special_flag[3] == 0) which = -1;                         \
-        else if (special_flag[3] == 1) which = 0;                     \
-        else which = 3;                                               \
+        which = 3;						      \
       }                                                               \
     }                                                                 \
   }                                                                   \
@@ -199,7 +203,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
   if (offload) {
     fix->start_watch(TIME_PACK);
     buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
-    buffers->grow_nbor(list, atom->nlocal, aend);
+    buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
 
     ATOM_T biga;
     biga.x = INTEL_BIGP;
@@ -335,7 +339,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
     signal(tag)
   #endif
   {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
@@ -494,12 +498,12 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
 	    if (j >= nlocal) {
 	      if (j == nall)
 		jlist[jj] = nall_offset;
-	      else if (which > 0)
+	      else if (which) 
 		jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
 	      else jlist[jj]-=ghost_offset;
             } else
             #endif
-	      if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+	      if (which) jlist[jj] = j ^ (which << SBBITS);
           }
         }
       }
@@ -520,7 +524,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
       }
       #endif
     } // end omp
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end offload
@@ -688,7 +692,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
   if (offload) {
     fix->start_watch(TIME_PACK);
     buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
-    buffers->grow_nbor(list, atom->nlocal, aend);
+    buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
 
     ATOM_T biga;
     biga.x = INTEL_BIGP;
@@ -827,7 +831,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
     signal(tag)
   #endif
   {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
@@ -848,33 +852,47 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
 
       const int num = aend - astart;
       int tid, ifrom, ito;
+
+      #ifdef OUTER_CHUNK
+      const int swidth = ip_simd::SIMD_type<flt_t>::width();
+      IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, swidth);
+      ifrom += astart;
+      ito += astart;
+      int e_ito = ito;
+      if (ito == num) {
+	int imod = ito % swidth;
+	if (imod) e_ito += swidth - e_ito;
+      }
+      const int list_size = (e_ito + tid + 1) * maxnbors;
+      #else
+      const int swidth = 1;
       IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
       ifrom += astart;
       ito += astart;
+      const int list_size = (ito + tid + 1) * maxnbors;
+      #endif
 
       int which;
 
-      const int list_size = (ito + tid + 1) * maxnbors;
+      int pack_offset = maxnbors * swidth;
       int ct = (ifrom + tid) * maxnbors;
       int *neighptr = firstneigh + ct;
+
+      int max_chunk = 0;
+      int lane = 0;
       for (int i = ifrom; i < ito; i++) {
-        int j, k, n, n2, itype, jtype, ibin;
-        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
-
-        n = 0;
-	n2 = maxnbors;
-
-        xtmp = x[i].x;
-        ytmp = x[i].y;
-        ztmp = x[i].z;
-        itype = x[i].w;
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const int itype = x[i].w;
         const int ioffset = ntypes * itype;
 
         // loop over rest of atoms in i's bin, ghosts are at end of linked list
         // if j is owned atom, store it, since j is beyond i in linked list
         // if j is ghost, only store if j coords are "above/to the right" of i
 
-        for (j = bins[i]; j >= 0; j = bins[j]) {
+	int raw_count = pack_offset;
+        for (int j = bins[i]; j >= 0; j = bins[j]) {
           if (j >= nlocal) {
             if (offload_noghost && offload) continue;
             if (x[j].z < ztmp) continue;
@@ -884,116 +902,145 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
             }
           } else if (offload_noghost && i < offload_end) continue;
 
-          jtype = x[j].w;
           #ifndef _LMP_INTEL_OFFLOAD
-          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+          if (exclude) {
+	    const int jtype = x[j].w;
+	    if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
+	  }
 	  #endif
 
-          delx = xtmp - x[j].x;
-          dely = ytmp - x[j].y;
-          delz = ztmp - x[j].z;
-          rsq = delx * delx + dely * dely + delz * delz;
+	  neighptr[raw_count++] = j;
+	}
 
-          if (rsq <= cutneighsq[ioffset + jtype]) {
-	    if (j < nlocal) {
-              if (need_ic) {
-                int no_special;
-                ominimum_image_check(no_special, delx, dely, delz);
-                if (no_special)
-                  neighptr[n++] = -j - 1;
-                else
-                  neighptr[n++] = j;
-              } else
-	        neighptr[n++] = j;
-	      #ifdef _LMP_INTEL_OFFLOAD
-	      if (j < lmin) lmin = j;
-	      if (j > lmax) lmax = j;
-              #endif
-	    } else {
-              if (need_ic) {
-                int no_special;
-                ominimum_image_check(no_special, delx, dely, delz);
-                if (no_special)
-                  neighptr[n2++] = -j - 1;
-                else
-                  neighptr[n2++] = j;
-              } else
-	        neighptr[n2++] = j;
-	      #ifdef _LMP_INTEL_OFFLOAD
-	      if (j < gmin) gmin = j;
-	      if (j > gmax) gmax = j;
-              #endif
-            }
-	  }
-        }
         // loop over all atoms in other bins in stencil, store every pair
 
-        ibin = atombin[i];
-
-        for (k = 0; k < nstencil; k++) {
-          for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
+        const int ibin = atombin[i];
+        for (int k = 0; k < nstencil; k++) {
+          for (int j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
             if (offload_noghost) {
               if (j < nlocal) {
                 if (i < offload_end) continue;
               } else if (offload) continue;
             }
 
-            jtype = x[j].w;
             #ifndef _LMP_INTEL_OFFLOAD
-            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+            if (exclude) {
+	      const int jtype = x[j].w;
+	      if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
+	    }
 	    #endif
 
-            delx = xtmp - x[j].x;
-            dely = ytmp - x[j].y;
-            delz = ztmp - x[j].z;
-            rsq = delx * delx + dely * dely + delz * delz;
-            if (rsq <= cutneighsq[ioffset + jtype]) {
-	      if (j < nlocal) {
-                if (need_ic) {
-                  int no_special;
-                  ominimum_image_check(no_special, delx, dely, delz);
-                  if (no_special)
-                    neighptr[n++] = -j - 1;
-		  else
-                    neighptr[n++] = j;
-                } else
-  		  neighptr[n++] = j;
-                #ifdef _LMP_INTEL_OFFLOAD
-		if (j < lmin) lmin = j;
-		if (j > lmax) lmax = j;
-                #endif
-	      } else {
-		if (need_ic) {
-		  int no_special;
-		  ominimum_image_check(no_special, delx, dely, delz);
-                  if (no_special)
-                    neighptr[n2++] = -j - 1;
-		  else
-                    neighptr[n2++] = j;
-                } else
-  		  neighptr[n2++] = j;
-	        #ifdef _LMP_INTEL_OFFLOAD
-		if (j < gmin) gmin = j;
-		if (j > gmax) gmax = j;
-                #endif
-	      }
-	    }
-          }
-        }
-        ilist[i] = i;
-
-        cnumneigh[i] = ct;
-        if (n > maxnbors) *overflow = 1;
-	for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
-        while( (n % pad_width) != 0 ) neighptr[n++] = e_nall;
-        numneigh[i] = n;
-	while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
-        ct += n;
-        neighptr += n;
-	if (ct + n + maxnbors > list_size) {
-	  *overflow = 1;
-	  ct = (ifrom + tid) * maxnbors;
+	    neighptr[raw_count++] = j;
+	  }
 	}
+
+        #if defined(LMP_SIMD_COMPILER)
+	#ifdef _LMP_INTEL_OFFLOAD
+	int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
+	#pragma vector aligned
+        #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
+	#else
+	#pragma vector aligned
+        #pragma simd
+	#endif
+	#endif
+	for (int u = pack_offset; u < raw_count; u++) {
+          int j = neighptr[u];
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+	  const int jtype = x[j].w;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          if (rsq > cutneighsq[ioffset + jtype]) 
+	    neighptr[u] = e_nall;
+	  else {
+	    if (need_ic) {
+	      int no_special;
+	      ominimum_image_check(no_special, delx, dely, delz);
+	      if (no_special)
+		neighptr[u] = -j - 1;
+	    }
+            #ifdef _LMP_INTEL_OFFLOAD
+            if (j < nlocal) {
+              if (j < vlmin) vlmin = j;
+              if (j > vlmax) vlmax = j;
+            } else {
+              if (j < vgmin) vgmin = j;
+              if (j > vgmax) vgmax = j;
+            }
+            #endif
+	  }
+	}
+        #ifdef _LMP_INTEL_OFFLOAD
+	lmin = MIN(lmin,vlmin);
+	gmin = MIN(gmin,vgmin);
+	lmax = MAX(lmax,vlmax);
+	gmax = MAX(gmax,vgmax);
+        #endif
+
+        int n = lane, n2 = pack_offset;
+	for (int u = pack_offset; u < raw_count; u++) {
+	  const int j = neighptr[u];
+	  int pj = j;
+	  if (pj < e_nall) {
+	    if (need_ic)
+	      if (pj < 0) pj = -pj - 1;
+
+	    if (pj < nlocal) {
+	      neighptr[n] = j;
+	      n += swidth;
+	    } else
+	      neighptr[n2++] = j;
+	  }
+	}
+	int ns = (n - lane) / swidth;
+	if (ns > maxnbors || n2 > list_size) *overflow = 1;
+	for (int u = pack_offset; u < n2; u++) {
+	  neighptr[n] = neighptr[u];
+	  n += swidth;
+	}
+
+        ilist[i] = i;
+        cnumneigh[i] = ct + lane;
+	ns += n2 - pack_offset;
+	#ifndef OUTER_CHUNK
+        while( (ns % pad_width) != 0 ) neighptr[ns++] = e_nall;
+	#endif
+        numneigh[i] = ns;
+
+	#ifdef OUTER_CHUNK
+	if (ns > max_chunk) max_chunk = ns;
+	lane++;
+	pack_offset -= maxnbors;
+	if (lane == swidth) {
+	  ct += max_chunk * swidth;
+	  const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+	  const int edge = (ct % alignb);
+	  if (edge) ct += alignb - edge;
+	  neighptr = firstneigh + ct;
+	  max_chunk = 0;
+	  pack_offset = maxnbors * swidth;
+	  lane = 0;
+	  if (ct + pack_offset + maxnbors > list_size) {
+            if (i < ito - 1) {
+	      *overflow = 1;
+	      ct = (ifrom + tid) * maxnbors;
+            }
+	  }
+	}
+	#else
+	ct += ns;
+	const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+	const int edge = (ct % alignb);
+	if (edge) ct += alignb - edge;
+	neighptr = firstneigh + ct;
+	if (ct + pack_offset + maxnbors > list_size) {
+	  if (i < ito - 1) {
+	    *overflow = 1;
+	    ct = (ifrom + tid) * maxnbors;
+	  }
+	}
+	#endif
       }
 
       if (*overflow == 1)
@@ -1032,7 +1079,16 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
         for (int i = ifrom; i < ito; ++i) {
           int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
+	  #ifndef OUTER_CHUNK
+          #if defined(LMP_SIMD_COMPILER)
+	  #pragma vector aligned
+          #pragma simd
+	  #endif
           for (int jj = 0; jj < jnum; jj++) {
+	  #else
+	  const int trip = jnum * swidth;
+          for (int jj = 0; jj < trip; jj+= swidth) {
+	  #endif
             const int j = jlist[jj];
 	    if (need_ic && j < 0) {
 	      which = 0;
@@ -1044,12 +1100,12 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
 	    if (j >= nlocal) {
 	      if (j == e_nall)
 		jlist[jj] = nall_offset;
-	      else if (which > 0)
+	      else if (which) 
 		jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
 	      else jlist[jj]-=ghost_offset;
             } else
 	    #endif
-            if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+            if (which) jlist[jj] = j ^ (which << SBBITS);
           }
         }
       }
@@ -1070,7 +1126,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
       }
       #endif
     } // end omp
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end offload
@@ -1238,7 +1294,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
   if (offload) {
     fix->start_watch(TIME_PACK);
     buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
-    buffers->grow_nbor(list, atom->nlocal, aend);
+    buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);
 
     ATOM_T biga;
     biga.x = INTEL_BIGP;
@@ -1377,7 +1433,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
     signal(tag)
   #endif
   {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
@@ -1550,12 +1606,12 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
 	    if (j >= nlocal) {
 	      if (j == e_nall)
 		jlist[jj] = nall_offset;
-	      else if (which > 0)
+	      else if (which) 
 		jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
 	      else jlist[jj]-=ghost_offset;
             } else
             #endif
-	      if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+	      if (which) jlist[jj] = j ^ (which << SBBITS);
           }
         }
       }
@@ -1576,7 +1632,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
       }
       #endif
     } // end omp
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end offload
@@ -1741,10 +1797,12 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
   const int nall = atom->nlocal + atom->nghost;
   int pad = 1;
 
+  const int pack_width = fix->nbor_pack_width();
+
   if (offload) {
     fix->start_watch(TIME_PACK);
     buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
-    buffers->grow_nbor(list, atom->nlocal, aend);
+    buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend, pack_width);
 
     ATOM_T biga;
     biga.x = INTEL_BIGP;
@@ -1871,7 +1929,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
     in(atombin:length(aend) alloc_if(0) free_if(0)) \
     in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
     in(special_flag:length(0) alloc_if(0) free_if(0)) \
-    in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
+    in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pack_width)	\
     in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
     in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
     out(overflow:length(5) alloc_if(0) free_if(0)) \
@@ -1879,7 +1937,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
     signal(tag)
   #endif
   {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
@@ -1900,36 +1958,40 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
 
       const int num = aend - astart;
       int tid, ifrom, ito;
-      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
+
+      IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
       ifrom += astart;
       ito += astart;
+      int e_ito = ito;
+      if (ito == num) {
+	int imod = ito % pack_width;
+	if (imod) e_ito += pack_width - e_ito;
+      }
+      const int list_size = (e_ito + tid + 1) * maxnbors;
 
       int which;
 
-      const int list_size = (ito + tid + 1) * maxnbors;
+      int pack_offset = maxnbors * pack_width;
       int ct = (ifrom + tid) * maxnbors;
       int *neighptr = firstneigh + ct;
+
+      int max_chunk = 0;
+      int lane = 0;
       for (int i = ifrom; i < ito; i++) {
-        int j, k, n, n2, itype, jtype, ibin;
-        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
-
-        n = 0;
-        n2 = maxnbors;
-
-        xtmp = x[i].x;
-        ytmp = x[i].y;
-        ztmp = x[i].z;
-        itype = x[i].w;
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const int itype = x[i].w;
         const tagint itag = tag[i];
         const int ioffset = ntypes * itype;
 
+        const int ibin = atombin[i];
+	int raw_count = pack_offset;
+
         // loop over all atoms in surrounding bins in stencil including self
         // skip i = j
-
-        ibin = atombin[i];
-
-        for (k = 0; k < nstencil; k++) {
-          for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
+        for (int k = 0; k < nstencil; k++) {
+          for (int j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
             if (i == j) continue;
 
             if (offload_noghost) {
@@ -1938,76 +2000,121 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
               } else if (offload) continue;
             }
 
-            jtype = x[j].w;
             #ifndef _LMP_INTEL_OFFLOAD
-            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
-            #endif
-
-            delx = xtmp - x[j].x;
-            dely = ytmp - x[j].y;
-            delz = ztmp - x[j].z;
-            rsq = delx * delx + dely * dely + delz * delz;
-            if (rsq <= cutneighsq[ioffset + jtype]) {
-	      const int jtag = tag[j];
-              int flist = 0;
-	      if (itag > jtag) {
-		if ((itag+jtag) % 2 == 0) flist = 1;
-	      } else if (itag < jtag) {
-		if ((itag+jtag) % 2 == 1) flist = 1;
-	      } else {
-		if (x[j].z < ztmp) flist = 1;
-		else if (x[j].z == ztmp && x[j].y < ytmp) flist = 1;
-		else if (x[j].z == ztmp && x[j].y == ytmp && x[j].x < xtmp)
-		  flist = 1;
-	      }
-	      if (flist) {
-                if (need_ic) {
-                  int no_special;
-                  ominimum_image_check(no_special, delx, dely, delz);
-                  if (no_special)
-                    neighptr[n2++] = -j - 1;
-		  else
-                    neighptr[n2++] = j;
-                } else
- 		  neighptr[n2++] = j;
-              } else {
-                if (need_ic) {
-                  int no_special;
-                  ominimum_image_check(no_special, delx, dely, delz);
-                  if (no_special)
-                    neighptr[n++] = -j - 1;
-		  else
-                    neighptr[n++] = j;
-                } else
- 		  neighptr[n++] = j;
-              }
-
-              #ifdef _LMP_INTEL_OFFLOAD
-	      if (j < nlocal) {
-                if (j < lmin) lmin = j;
-                if (j > lmax) lmax = j;
-              } else {
-                if (j < gmin) gmin = j;
-                if (j > gmax) gmax = j;
-              }
-              #endif
+            if (exclude) {
+              const int jtype = x[j].w;
+              if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
             }
+            #endif
+	    
+	    neighptr[raw_count++] = j;
           }
         }
-        ilist[i] = i;
 
-        cnumneigh[i] = ct;
-        if (n > maxnbors) *overflow = 1;
-	atombin[i] = n;
-        for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
-        numneigh[i] = n;
-        while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
-        ct += n;
-        neighptr += n;
-        if (ct + n + maxnbors > list_size) {
-          *overflow = 1;
-          ct = (ifrom + tid) * maxnbors;
+        #if defined(LMP_SIMD_COMPILER)
+	#ifdef _LMP_INTEL_OFFLOAD
+	int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
+	#pragma vector aligned
+        #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
+	#else
+	#pragma vector aligned
+        #pragma simd
+	#endif
+	#endif
+	for (int u = pack_offset; u < raw_count; u++) {
+          int j = neighptr[u];
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const int jtype = x[j].w;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          if (rsq > cutneighsq[ioffset + jtype]) 
+	    neighptr[u] = e_nall;
+	  else {
+            if (need_ic) {
+              int no_special;
+              ominimum_image_check(no_special, delx, dely, delz);
+              if (no_special)
+                neighptr[u] = -j - 1;
+            }
+            #ifdef _LMP_INTEL_OFFLOAD
+            if (j < nlocal) {
+              if (j < vlmin) vlmin = j;
+              if (j > vlmax) vlmax = j;
+            } else {
+              if (j < vgmin) vgmin = j;
+              if (j > vgmax) vgmax = j;
+            }
+            #endif
+	  }
+	}
+        #ifdef _LMP_INTEL_OFFLOAD
+	lmin = MIN(lmin,vlmin);
+	gmin = MIN(gmin,vgmin);
+	lmax = MAX(lmax,vlmax);
+	gmax = MAX(gmax,vgmax);
+        #endif
+
+        int n = lane, n2 = pack_offset;
+	for (int u = pack_offset; u < raw_count; u++) {
+	  const int j = neighptr[u];
+	  int pj = j;
+	  if (pj < e_nall) {
+	    if (need_ic)
+	      if (pj < 0) pj = -pj - 1;
+	
+	    const int jtag = tag[pj];
+	    int flist = 0;
+	    if (itag > jtag) {
+	      if ((itag+jtag) % 2 == 0) flist = 1;
+	    } else if (itag < jtag) {
+	      if ((itag+jtag) % 2 == 1) flist = 1;
+	    } else {
+              if (x[pj].z < ztmp) flist = 1;
+	      else if (x[pj].z == ztmp && x[pj].y < ytmp) flist = 1;
+	      else if (x[pj].z == ztmp && x[pj].y == ytmp && x[pj].x < xtmp) 
+	      flist = 1;
+	    }
+	    if (flist) {
+	      neighptr[n2++] = j;
+	    } else {
+	      neighptr[n] = j;
+	      n += pack_width;
+	    }
+          }
         }
+	int ns = (n - lane) / pack_width;
+	if (ns > maxnbors || n2 > list_size) *overflow = 1;
+	atombin[i] = ns;
+	for (int u = pack_offset; u < n2; u++) {
+	  neighptr[n] = neighptr[u];
+	  n += pack_width;
+	}
+
+        ilist[i] = i;
+        cnumneigh[i] = ct + lane;
+	ns += n2 - pack_offset;
+        numneigh[i] = ns;
+
+	if (ns > max_chunk) max_chunk = ns;
+	lane++;
+	pack_offset -= maxnbors;
+	if (lane == pack_width) {
+	  ct += max_chunk * pack_width;
+	  const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+	  const int edge = (ct % alignb);
+	  if (edge) ct += alignb - edge;
+	  neighptr = firstneigh + ct;
+	  max_chunk = 0;
+	  pack_offset = maxnbors * pack_width;
+	  lane = 0;
+	  if (ct + pack_offset + maxnbors > list_size) {
+  	    if (i < ito - 1) {
+	      *overflow = 1;
+	      ct = (ifrom + tid) * maxnbors;
+	    }
+          }
+	}
       }
 
       if (*overflow == 1)
@@ -2046,7 +2153,9 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
         for (int i = ifrom; i < ito; ++i) {
           int * _noalias jlist = firstneigh + cnumneigh[i];
           const int jnum = numneigh[i];
-          for (int jj = 0; jj < jnum; jj++) {
+
+	  const int trip = jnum * pack_width;
+          for (int jj = 0; jj < trip; jj+=pack_width) {
             const int j = jlist[jj];
 	    if (need_ic && j < 0) {
 	      which = 0;
@@ -2058,12 +2167,12 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
             if (j >= nlocal) {
               if (j == e_nall)
                 jlist[jj] = nall_offset;
-              else if (which > 0)
+              else if (which)
                 jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
               else jlist[jj]-=ghost_offset;
             } else
             #endif
-            if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+            if (which) jlist[jj] = j ^ (which << SBBITS);
           }
         }
       }
@@ -2083,7 +2192,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
       }
       #endif
     } // end omp
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end offload
@@ -2113,3 +2222,4 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
     #endif
   }
 }
+
diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp
index 5b3e65bf66..9de0b9291a 100644
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@@ -12,9 +12,17 @@
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
-#include <math.h>
 #include "pair_gayberne_intel.h"
 #include "math_extra_intel.h"
+
+#ifdef _LMP_INTEL_OFFLOAD
+#pragma offload_attribute(push,target(mic))
+#endif
+#include <cmath>
+#ifdef _LMP_INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
 #include "atom.h"
 #include "comm.h"
 #include "atom_vec_ellipsoid.h"
@@ -295,7 +303,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
     signal(f_start)
   #endif
   {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute=MIC_Wtime();
     #endif
 
@@ -335,8 +343,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 
     acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
     if (EVFLAG) {
-      oevdwl = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+      oevdwl = (acc_t)0.0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
     }
 
     // loop over neighbors of my atoms
@@ -394,8 +402,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
         fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
 
         if (EVFLAG) {
-          if (EFLAG) fwtmp = sevdwl = (acc_t)0;
-          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+          if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
         }
 
         bool multiple_forms = false;
@@ -485,14 +493,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           tempv_1 = kappa_1 * inv_r;
           tempv_2 = kappa_2 * inv_r;
           flt_t sigma12 = ME_dot3(r12hat, tempv);
-          sigma12 = pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
+          sigma12 = std::pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
           flt_t h12 = r - sigma12;
 
           // energy
           // compute u_r
 
           flt_t varrho = sigma / (h12 + gamma * sigma);
-          flt_t varrho6 = pow(varrho, (flt_t)6.0);
+          flt_t varrho6 = std::pow(varrho, (flt_t)6.0);
           flt_t varrho12 = varrho6 * varrho6;
           flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
 
@@ -500,7 +508,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 
           flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
           flt_t det_g12 = ME_det3(g12);
-          eta = pow(eta / det_g12, upsilon);
+          eta = std::pow(eta / det_g12, upsilon);
 
           // compute chi_12
 
@@ -516,7 +524,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           tempv_1 = iota_1 * inv_r;
           tempv_2 = iota_2 * inv_r;
           flt_t chi = ME_dot3(r12hat, tempv);
-          chi = pow(chi * (flt_t)2.0, mu);
+          chi = std::pow(chi * (flt_t)2.0, mu);
 
           // force
           // compute dUr/dr
@@ -524,7 +532,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
           temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
 	    sigma;
           temp1 = temp1 * (flt_t)24.0 * epsilon;
-          flt_t u_slj = temp1 * pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
+          flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
           flt_t dUr_0, dUr_1, dUr_2;
           temp2 = ME_dot3(kappa, r12hat);
           flt_t uslj_rsq = u_slj / rsq_form[jj];
@@ -536,8 +544,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 
           flt_t dchi_0, dchi_1, dchi_2;
           temp1 = ME_dot3(iota, r12hat);
-          temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
-	    pow(chi, (mu - (flt_t)1.0) / mu);
+          temp2 = (flt_t)-4.0 / rsq_form[jj] * mu * 
+	    std::pow(chi, (mu - (flt_t)1.0) / mu);
           dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
           dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
           dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
@@ -714,7 +722,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 	    }
 
 	    if (EVFLAG) {
-	      flt_t ev_pre = (flt_t)0;
+	      flt_t ev_pre = (flt_t)0.0;
 	      if (NEWTON_PAIR || i < nlocal)
 		ev_pre += (flt_t)0.5;
 	      if (NEWTON_PAIR || j < nlocal)
@@ -863,7 +871,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
       }
     }
 
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // offload
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
index 7ed23d4dee..41ab6885ec 100644
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@@ -217,7 +217,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
     ITABLE_IN signal(f_start)
   #endif
   {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
@@ -459,7 +459,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
         ev_global[7] = ov5;
       }
     }
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end of offload region
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
index 427d7be33f..40451d6f36 100644
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@@ -212,7 +212,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
     ITABLE_IN signal(f_start)
   #endif
   {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
@@ -263,7 +263,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 	}
 
-        #if defined(__INTEL_COMPILER)
+        #if defined(LMP_SIMD_COMPILER)
 	#pragma vector aligned
 	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
 	                       sv0, sv1, sv2, sv3, sv4, sv5)
@@ -283,7 +283,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 
           const flt_t r2inv = (flt_t)1.0 / rsq;
 
-	  #ifdef __MIC__
+	  #ifdef INTEL_VMASK
 	  if (rsq < c_forcei[jtype].cutsq) {
           #endif
             #ifdef INTEL_ALLOW_TABLE
@@ -335,11 +335,11 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
               }
             }
             #endif
-	  #ifdef __MIC__
+	  #ifdef INTEL_VMASK
 	  }
 	  #endif
 
-	  #ifdef __MIC__
+	  #ifdef INTEL_VMASK
 	  if (rsq < c_forcei[jtype].cut_ljsq) {
 	  #endif
             flt_t r6inv = r2inv * r2inv * r2inv;
@@ -354,7 +354,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
               forcelj *= factor_lj;
               if (EFLAG) evdwl *= factor_lj;
             }
-	  #ifdef __MIC__
+	  #ifdef INTEL_VMASK
 	  }
 	  #else
 	  if (rsq > c_forcei[jtype].cutsq)
@@ -363,7 +363,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 	    { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
 	  #endif
 
-	  #ifdef __MIC__
+	  #ifdef INTEL_VMASK
 	  if (rsq < c_forcei[jtype].cutsq) {
 	  #endif
             const flt_t fpair = (forcecoul + forcelj) * r2inv;
@@ -395,7 +395,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
               }
  	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
             }
-          #ifdef __MIC__
+          #ifdef INTEL_VMASK
 	  }
 	  #endif
         } // for jj
@@ -426,7 +426,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
         ev_global[7] = ov5;
       }
     }
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end of offload region
diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp
index a3665b6588..e1300b1a52 100644
--- a/src/USER-INTEL/pair_lj_cut_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@@ -88,39 +88,73 @@ void PairLJCutIntel::compute(int eflag, int vflag,
     fix->stop_watch(TIME_PACK);
   }
 
-  if (evflag || vflag_fdotr) {
-    int ovflag = 0;
-    if (vflag_fdotr) ovflag = 2;
-    else if (vflag) ovflag = 1;
-    if (eflag) {
-      if (force->newton_pair) {
-        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+  if (_onetype) {
+    if (evflag || vflag_fdotr) {
+      int ovflag = 0;
+      if (vflag_fdotr) ovflag = 2;
+      else if (vflag) ovflag = 1;
+      if (eflag) {
+	if (force->newton_pair) {
+	  eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+	} else {
+	  eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+	}
       } else {
-        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+	if (force->newton_pair) {
+	  eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+	} else {
+	  eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+	}
       }
     } else {
       if (force->newton_pair) {
-        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+	eval<1,0,0,1>(1, 0, buffers, fc, 0, offload_end);
+	eval<1,0,0,1>(0, 0, buffers, fc, host_start, inum);
       } else {
-        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+	eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end);
+	eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum);
       }
     }
   } else {
-    if (force->newton_pair) {
-      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+    if (evflag || vflag_fdotr) {
+      int ovflag = 0;
+      if (vflag_fdotr) ovflag = 2;
+      else if (vflag) ovflag = 1;
+      if (eflag) {
+	if (force->newton_pair) {
+	  eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+	} else {
+	  eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+	}
+      } else {
+	if (force->newton_pair) {
+	  eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+	} else {
+	  eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+	}
+      }
     } else {
-      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+      if (force->newton_pair) {
+	eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
+	eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
+      } else {
+	eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
+	eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
+      }
     }
   }
 }
 
-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, 
+	  class acc_t>
 void PairLJCutIntel::eval(const int offload, const int vflag,
                           IntelBuffers<flt_t,acc_t> *buffers,
                           const ForceConst<flt_t> &fc,
@@ -159,7 +193,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
   const int nthreads = tc;
   int *overflow = fix->get_off_overflow_flag();
   {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
@@ -187,12 +221,25 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
       FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
       memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
 
+      flt_t cutsq, lj1, lj2, lj3, lj4, offset;
+      if (ONETYPE) {
+	cutsq = ljc12o[3].cutsq;
+	lj1 = ljc12o[3].lj1;
+	lj2 = ljc12o[3].lj2;
+	lj3 = lj34[3].lj3;
+	lj4 = lj34[3].lj4;
+	offset = ljc12o[3].offset;
+      }
       for (int i = iifrom; i < iito; ++i) {
-        const int itype = x[i].w;
-
-        const int ptr_off = itype * ntypes;
-        const FC_PACKED1_T * _noalias const ljc12oi = ljc12o + ptr_off;
-        const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
+        int itype, ptr_off;
+        const FC_PACKED1_T * _noalias ljc12oi;
+        const FC_PACKED2_T * _noalias lj34i;
+	if (!ONETYPE) {
+	  itype = x[i].w;
+          ptr_off = itype * ntypes;
+          ljc12oi = ljc12o + ptr_off;
+          lj34i = lj34 + ptr_off;
+	}
 
         const int * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
@@ -218,25 +265,42 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
           flt_t forcelj, evdwl;
           forcelj = evdwl = (flt_t)0.0;
 
-          const int sbindex = jlist[jj] >> SBBITS & 3;
-          const int j = jlist[jj] & NEIGHMASK;
+	  int j, jtype, sbindex;
+	  if (!ONETYPE) {
+	    sbindex = jlist[jj] >> SBBITS & 3;
+	    j = jlist[jj] & NEIGHMASK;
+	  } else
+	    j = jlist[jj];
+
           const flt_t delx = xtmp - x[j].x;
           const flt_t dely = ytmp - x[j].y;
           const flt_t delz = ztmp - x[j].z;
-          const int jtype = x[j].w;
+          if (!ONETYPE) {
+	    jtype = x[j].w;
+            cutsq = ljc12oi[jtype].cutsq;
+	  }
           const flt_t rsq = delx * delx + dely * dely + delz * delz;
 
           #ifdef INTEL_VMASK
-          if (rsq < ljc12oi[jtype].cutsq) {
+          if (rsq < cutsq) {
 	  #endif
-            flt_t factor_lj = special_lj[sbindex];
+            flt_t factor_lj;
+	    if (!ONETYPE) factor_lj = special_lj[sbindex];
             flt_t r2inv = 1.0 / rsq;
             flt_t r6inv = r2inv * r2inv * r2inv;
             #ifndef INTEL_VMASK
-	    if (rsq > ljc12oi[jtype].cutsq) r6inv = (flt_t)0.0;
+	    if (rsq > cutsq) r6inv = (flt_t)0.0;
 	    #endif
-            forcelj = r6inv * (ljc12oi[jtype].lj1 * r6inv - ljc12oi[jtype].lj2);
-            flt_t fpair = factor_lj * forcelj * r2inv;
+	    if (!ONETYPE) {
+	      lj1 = ljc12oi[jtype].lj1;
+	      lj2 = ljc12oi[jtype].lj2;
+	    }
+            forcelj = r6inv * (lj1 * r6inv - lj2);
+            flt_t fpair;
+	    if (!ONETYPE)
+	      fpair = factor_lj * forcelj * r2inv;
+	    else
+	      fpair = forcelj * r2inv;
 
             fxtmp += delx * fpair;
             fytmp += dely * fpair;
@@ -255,9 +319,13 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
                 ev_pre += (flt_t)0.5;
 
               if (EFLAG) {
-                evdwl = r6inv * (lj34i[jtype].lj3 * r6inv-lj34i[jtype].lj4) -
-                    ljc12oi[jtype].offset;
-                evdwl *= factor_lj;
+		if (!ONETYPE) {
+		  lj3 = lj34i[jtype].lj3;
+		  lj4 = lj34i[jtype].lj4;
+		  offset = ljc12oi[jtype].offset;
+		}
+                evdwl = r6inv * (lj3 * r6inv - lj4) - offset;
+                if (!ONETYPE) evdwl *= factor_lj;
                 sevdwl += ev_pre*evdwl;
                 if (eatom) {
                   if (NEWTON_PAIR || i < nlocal)
@@ -302,7 +370,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
         ev_global[7] = ov5;
       }
     }
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end offload
@@ -352,6 +420,9 @@ template <class flt_t, class acc_t>
 void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
                                       IntelBuffers<flt_t,acc_t> *buffers)
 {
+  _onetype = 0;
+  if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
+
   int tp1 = atom->ntypes + 1;
   fc.set_ntypes(tp1,memory,_cop);
   buffers->set_ntypes(tp1);
diff --git a/src/USER-INTEL/pair_lj_cut_intel.h b/src/USER-INTEL/pair_lj_cut_intel.h
index 13737affab..a9c77324f3 100644
--- a/src/USER-INTEL/pair_lj_cut_intel.h
+++ b/src/USER-INTEL/pair_lj_cut_intel.h
@@ -39,13 +39,14 @@ class PairLJCutIntel : public PairLJCut {
 
  private:
   FixIntel *fix;
-  int _cop;
+  int _cop, _onetype;
 
   template <class flt_t> class ForceConst;
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, 
+	    class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers,
             const ForceConst<flt_t> &fc, const int astart, const int aend);
diff --git a/src/USER-INTEL/pair_sw_intel.cpp b/src/USER-INTEL/pair_sw_intel.cpp
index 0380147abf..a1491e2859 100755
--- a/src/USER-INTEL/pair_sw_intel.cpp
+++ b/src/USER-INTEL/pair_sw_intel.cpp
@@ -15,11 +15,19 @@
    Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */
 
-#include <math.h>
+#include "pair_sw_intel.h"
+
+#ifdef _LMP_INTEL_OFFLOAD
+#pragma offload_attribute(push,target(mic))
+#endif
+#include <cmath>
+#ifdef _LMP_INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include "pair_sw_intel.h"
 #include "atom.h"
 #include "neighbor.h"
 #include "neigh_request.h"
@@ -33,10 +41,17 @@
 #include "modify.h"
 #include "suffix.h"
 
+#ifdef LMP_USE_AVXCD
+#define OUTER_CHUNK 1
+#include "intel_simd.h"
+using namespace ip_simd;
+#endif
+
 using namespace LAMMPS_NS;
 
 #define FC_PACKED0_T typename ForceConst<flt_t>::fc_packed0
 #define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
+#define FC_PACKED1p2_T typename ForceConst<flt_t>::fc_packed1p2
 #define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
 #define FC_PACKED3_T typename ForceConst<flt_t>::fc_packed3
 
@@ -62,7 +77,7 @@ void PairSWIntel::compute(int eflag, int vflag)
 {
   if (fix->precision() == FixIntel::PREC_MODE_MIXED)
     compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
-                          force_const_single);
+			  force_const_single);
   else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
     compute<double,double>(eflag, vflag, fix->get_double_buffers(),
                            force_const_double);
@@ -107,44 +122,81 @@ void PairSWIntel::compute(int eflag, int vflag,
     fix->stop_watch(TIME_PACK);
   }
 
-  if (_spq) {
-    if (evflag || vflag_fdotr) {
-      int ovflag = 0;
-      if (vflag_fdotr) ovflag = 2;
-      else if (vflag) ovflag = 1;
-      if (eflag) {
-	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+  if (_onetype) {
+    if (_spq) {
+      if (evflag || vflag_fdotr) {
+	int ovflag = 0;
+	if (vflag_fdotr) ovflag = 2;
+	else if (vflag) ovflag = 1;
+	if (eflag) {
+	  eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	  eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+	} else {
+	  eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	  eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+	}
       } else {
-	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+	eval<1,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
+	eval<1,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
       }
     } else {
-      eval<1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
-      eval<1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
+      if (evflag || vflag_fdotr) {
+	int ovflag = 0;
+	if (vflag_fdotr) ovflag = 2;
+	else if (vflag) ovflag = 1;
+	if (eflag) {
+	  eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	  eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+	} else {
+	  eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	  eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+	}
+      } else {
+	eval<0,1,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
+	eval<0,1,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
+      }
     }
   } else {
-   if (evflag || vflag_fdotr) {
-      int ovflag = 0;
-      if (vflag_fdotr) ovflag = 2;
-      else if (vflag) ovflag = 1;
-      if (eflag) {
-	eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+    if (_spq) {
+      if (evflag || vflag_fdotr) {
+	int ovflag = 0;
+	if (vflag_fdotr) ovflag = 2;
+	else if (vflag) ovflag = 1;
+	if (eflag) {
+	  eval<1,0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	  eval<1,0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+	} else {
+	  eval<1,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	  eval<1,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+	}
       } else {
-	eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
-	eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+	eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
+	eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
       }
     } else {
-      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
-      eval<0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
+      if (evflag || vflag_fdotr) {
+	int ovflag = 0;
+	if (vflag_fdotr) ovflag = 2;
+	else if (vflag) ovflag = 1;
+	if (eflag) {
+	  eval<0,0,1,1>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	  eval<0,0,1,1>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+	} else {
+	  eval<0,0,1,0>(1, ovflag, buffers, fc, 0, offload_end, _offload_pad);
+	  eval<0,0,1,0>(0, ovflag, buffers, fc, host_start, inum, _host_pad);
+	}
+      } else {
+	eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end, _offload_pad);
+	eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum, _host_pad);
+      }
     }
   }
 }
 
 /* ---------------------------------------------------------------------- */
+#ifndef LMP_USE_AVXCD
 
-template <int SPQ, int EVFLAG, int EFLAG, class flt_t, class acc_t>
+template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
 void PairSWIntel::eval(const int offload, const int vflag,
                        IntelBuffers<flt_t,acc_t> *buffers,
                        const ForceConst<flt_t> &fc, const int astart,
@@ -166,6 +218,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
 
   const FC_PACKED0_T * _noalias const p2 = fc.p2[0];
   const FC_PACKED1_T * _noalias const p2f = fc.p2f[0];
+  const FC_PACKED1p2_T * _noalias const p2f2 = fc.p2f2[0];
   const FC_PACKED2_T * _noalias const p2e = fc.p2e[0];
   const FC_PACKED3_T * _noalias const p3 = fc.p3[0][0];
 
@@ -198,7 +251,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
 
   if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
   #pragma offload target(mic:_cop) if(offload) \
-    in(p2,p2f,p2e,p3:length(0) alloc_if(0) free_if(0)) \
+    in(p2,p2f,p2f2,p2e,p3:length(0) alloc_if(0) free_if(0)) \
     in(firstneigh:length(0) alloc_if(0) free_if(0)) \
     in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
     in(numneigh:length(0) alloc_if(0) free_if(0)) \
@@ -215,7 +268,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
     signal(f_start)
   #endif
   {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime();
     #endif
 
@@ -251,18 +304,40 @@ void PairSWIntel::eval(const int offload, const int vflag,
       int * _noalias const tjtype = ccachej + toffs;
 
       // loop over full neighbor list of my atoms
+      flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3, c4, c5, c6;
+      flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
+      if (ONETYPE) {
+        cutsq = p2[3].cutsq;
+	cut = p2f[3].cut;
+	sigma = p2f[3].sigma;
+	c1 = p2f2[3].c1;
+	c2 = p2f2[3].c2;
+	c3 = p2f2[3].c3;
+	c4 = p2f2[3].c4;
+	sigma_gamma = p2[3].sigma_gamma;
+	costheta = p3[7].costheta;
+	lambda_epsilon = p3[7].lambda_epsilon;
+	lambda_epsilon2 = p3[7].lambda_epsilon2;
+	if (SPQ == 0) {
+          powerp = p2f[3].powerp;
+	  powerq = p2f[3].powerq;
+        }
+	if (EFLAG) {
+          c5 = p2e[3].c5;
+	  c6 = p2e[3].c6;
+        }
+      }
 
       for (int i = iifrom; i < iito; ++i) {
+        int itype, itype_offset;
         const flt_t xtmp = x[i].x;
         const flt_t ytmp = x[i].y;
         const flt_t ztmp = x[i].z;
-        const int itype = x[i].w;
 
-        const int ptr_off = itype * ntypes;
-        const FC_PACKED0_T * _noalias const p2i = p2 + ptr_off;
-        const FC_PACKED1_T * _noalias const p2fi = p2f + ptr_off;
-        const FC_PACKED2_T * _noalias const p2ei = p2e + ptr_off;
-        const FC_PACKED3_T * _noalias const p3i = p3 + ptr_off*ntypes;
+	if (!ONETYPE) {
+          itype = x[i].w;
+	  itype_offset = itype * ntypes;
+        } 
 
         const int * _noalias const jlist = firstneigh + cnumneigh[i];
         const int jnum = numneigh[i];
@@ -283,50 +358,58 @@ void PairSWIntel::eval(const int offload, const int vflag,
           const flt_t delx = x[j].x - xtmp;
           const flt_t dely = x[j].y - ytmp;
           const flt_t delz = x[j].z - ztmp;
-          const int jtype = x[j].w;
+          int jtype, ijtype;
+          if (!ONETYPE) {
+	    jtype = x[j].w;
+	    ijtype = itype_offset + jtype;
+	    cutsq = p2[ijtype].cutsq;
+	  } 
           const flt_t rsq1 = delx * delx + dely * dely + delz * delz;
-          if (rsq1 < p2i[jtype].cutsq) {
+          if (rsq1 < cutsq) {
 	    tdelx[ejnum] = delx;
 	    tdely[ejnum] = dely;
 	    tdelz[ejnum] = delz;
 	    trsq[ejnum] = rsq1;
 	    tj[ejnum] = j;
-	    tjtype[ejnum] = jtype;
+	    if (!ONETYPE) tjtype[ejnum] = jtype;
 	    ejnum++;
 	    if (jj < jnumhalf) ejnumhalf++;
 	  }
 	}
 	int ejnum_pad = ejnum;
+	
 	while ( (ejnum_pad % pad_width) != 0) {
 	  tdelx[ejnum_pad] = (flt_t)0.0;
 	  tdely[ejnum_pad] = (flt_t)0.0;
 	  tdelz[ejnum_pad] = (flt_t)0.0;
-	  trsq[ejnum_pad] = (flt_t)1.0;
+	  trsq[ejnum_pad] = p2[3].cutsq + (flt_t)1.0;
 	  tj[ejnum_pad] = nall;
-	  tjtype[ejnum_pad] = 0;
+	  if (!ONETYPE) tjtype[ejnum_pad] = 0;
 	  ejnum_pad++;
 	}
-
+	
         #if defined(LMP_SIMD_COMPILER)
-        #pragma vector aligned
-        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
-				 sv0, sv1, sv2, sv3, sv4, sv5)
+	#pragma vector aligned
+		#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+					 sv0, sv1, sv2, sv3, sv4, sv5)
 	#endif
         for (int jj = 0; jj < ejnum_pad; jj++) {
           acc_t fjxtmp, fjytmp, fjztmp, fjtmp;
           fjxtmp = fjytmp = fjztmp = (acc_t)0.0;
           if (EFLAG) fjtmp = (acc_t)0.0;
+	  int ijtype;
 
           const flt_t delx = tdelx[jj];
           const flt_t dely = tdely[jj];
           const flt_t delz = tdelz[jj];
-          const int jtype = tjtype[jj];
+	  if (!ONETYPE) ijtype = tjtype[jj] + itype_offset;
           const flt_t rsq1 = trsq[jj];
 
           const flt_t r1 = sqrt(rsq1);
           const flt_t rinvsq1 = (flt_t)1.0 / rsq1;
-          const flt_t rainv1 = (flt_t)1.0 / (r1 - p2fi[jtype].cut);
-
+	  if (!ONETYPE) cut = p2f[ijtype].cut;
+          const flt_t rainv1 = (flt_t)1.0 / (r1 - cut);
+	  
 	  // two-body interactions, skip half of them
 	  flt_t rp, rq;
 	  if (SPQ == 1) {
@@ -335,14 +418,26 @@ void PairSWIntel::eval(const int offload, const int vflag,
 	    rp = (flt_t)1.0 / rp;
 	    rq = (flt_t)1.0;
 	  } else {
-	    rp = pow(r1, -p2fi[jtype].powerp);
-	    rq = pow(r1, -p2fi[jtype].powerq);
+            if (!ONETYPE) {
+              powerp = p2f[ijtype].powerp;
+	      powerq = p2f[ijtype].powerq;
+            }
+	    rp = std::pow(r1, powerp);
+	    rq = std::pow(r1, powerq);
 	  }
+
+	  if (!ONETYPE) {
+            sigma = p2f[ijtype].sigma;
+	    c1 = p2f2[ijtype].c1;
+	    c2 = p2f2[ijtype].c2;
+	    c3 = p2f2[ijtype].c3; 
+	    c4 = p2f2[ijtype].c4;
+          }
+
 	  const flt_t rainvsq = rainv1 * rainv1 * r1;
-	  flt_t expsrainv = exp(p2fi[jtype].sigma * rainv1);
+	  flt_t expsrainv = exp(sigma * rainv1);
 	  if (jj >= ejnumhalf) expsrainv = (flt_t)0.0;
-	  const flt_t fpair = (p2fi[jtype].c1 * rp - p2fi[jtype].c2 * rq +
-			       (p2fi[jtype].c3 * rp -p2fi[jtype].c4 * rq) *
+	  const flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) * 
 			       rainvsq) * expsrainv * rinvsq1;
 
 	  fxtmp -= delx * fpair;
@@ -355,8 +450,11 @@ void PairSWIntel::eval(const int offload, const int vflag,
 	  if (EVFLAG) {
 	    if (EFLAG) {
 	      flt_t evdwl;
-	      evdwl = (p2ei[jtype].c5 * rp - p2ei[jtype].c6 * rq) *
-	    	      expsrainv;
+	      if (!ONETYPE) {
+		c5 = p2e[ijtype].c5;
+		c6 = p2e[ijtype].c6;
+	      }
+	      evdwl = (c5 * rp - c6 * rq) * expsrainv;
 	      sevdwl += evdwl;
 	      if (eatom) {
 		fwtmp += (acc_t)0.5 * evdwl;
@@ -369,43 +467,57 @@ void PairSWIntel::eval(const int offload, const int vflag,
 
 	  /*---------------------------------------------*/
 
-          flt_t gsrainv1 = p2i[jtype].sigma_gamma * rainv1;
+	  int ijkoff;
+	  if (!ONETYPE) {
+            sigma_gamma = p2[ijtype].sigma_gamma;
+	    ijkoff = ijtype * ntypes;
+          }
+
+          flt_t gsrainv1 = sigma_gamma * rainv1;
           flt_t gsrainvsq1 = gsrainv1 * rainv1 / r1;
           flt_t expgsrainv1 = exp(gsrainv1);
-	  const int joffset = jtype * ntypes;
 
           for (int kk = 0; kk < ejnum; kk++) {
+            int iktype, ijktype;
+	    if (!ONETYPE) {
+              iktype = tjtype[kk];
+	      ijktype = ijkoff + iktype;
+	      iktype += itype_offset;
+	      cut = p2[iktype].cut;
+	      sigma_gamma = p2[iktype].sigma_gamma;
+	      costheta = p3[ijktype].costheta;
+	      lambda_epsilon = p3[ijktype].lambda_epsilon;
+	      lambda_epsilon2 = p3[ijktype].lambda_epsilon2;
+            }
+
             flt_t delr2[3];
             delr2[0] = tdelx[kk];
             delr2[1] = tdely[kk];
             delr2[2] = tdelz[kk];
-	    const int ktype = tjtype[kk];
             const flt_t rsq2 = trsq[kk];
 
 	    const flt_t r2 = sqrt(rsq2);
 	    const flt_t rinvsq2 = (flt_t)1.0 / rsq2;
-	    const flt_t rainv2 = (flt_t)1.0 / (r2 - p2i[ktype].cut);
-	    const flt_t gsrainv2 = p2i[ktype].sigma_gamma * rainv2;
+	    const flt_t rainv2 = (flt_t)1.0 / (r2 - cut);
+	    const flt_t gsrainv2 = sigma_gamma * rainv2;
 	    const flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2;
 	    const flt_t expgsrainv2 = exp(gsrainv2);
 
 	    const flt_t rinv12 = (flt_t)1.0 / (r1 * r2);
 	    const flt_t cs = (delx * delr2[0] + dely * delr2[1] +
                               delz * delr2[2]) * rinv12;
-	    const flt_t delcs = cs - p3i[joffset + ktype].costheta;
+	    const flt_t delcs = cs - costheta;
 	    const flt_t delcssq = delcs*delcs;
 
 	    flt_t kfactor;
-	    if (jj == kk) kfactor = (flt_t)0.0;
+	    if (jj == kk || jj >= ejnum) kfactor = (flt_t)0.0;
 	    else kfactor = (flt_t)1.0;
 
 	    const flt_t facexp = expgsrainv1*expgsrainv2*kfactor;
-	    const flt_t facrad = p3i[joffset + ktype].lambda_epsilon *
-	                         facexp * delcssq;
+	    const flt_t facrad = lambda_epsilon * facexp * delcssq;
 	    const flt_t frad1 = facrad*gsrainvsq1;
 	    const flt_t frad2 = facrad*gsrainvsq2;
-	    const flt_t facang = p3i[joffset + ktype].lambda_epsilon2 *
-	                         facexp * delcs;
+	    const flt_t facang = lambda_epsilon2 * facexp * delcs;
 	    const flt_t facang12 = rinv12*facang;
 	    const flt_t csfacang = cs*facang;
 	    const flt_t csfac1 = rinvsq1*csfacang;
@@ -438,7 +550,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
           f[j].x += fjxtmp;
           f[j].y += fjytmp;
           f[j].z += fjztmp;
-          if (EFLAG)
+          if (EFLAG) 
 	    if (eatom) f[j].w += fjtmp;
         } // for jj
 
@@ -468,7 +580,7 @@ void PairSWIntel::eval(const int offload, const int vflag,
         ev_global[7] = ov5;
       }
     }
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
     *timer_compute = MIC_Wtime() - *timer_compute;
     #endif
   } // end offload
@@ -483,6 +595,535 @@ void PairSWIntel::eval(const int offload, const int vflag,
     fix->add_result_array(f_start, 0, offload);
 }
 
+#else 
+
+/* ----------------------------------------------------------------------
+
+Vector intrinsics are temporarily being used for the Stillinger-Weber
+potential to allow for advanced features in the AVX512 instruction set to
+be exploited on early hardware. We hope to see compiler improvements for
+AVX512 that will eliminate this requirement, so it is not recommended to
+develop code based on the intrinsics implementation. Please e-mail the
+authors for more details.
+
+------------------------------------------------------------------------- */
+
+template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
+void PairSWIntel::eval(const int offload, const int vflag,
+                       IntelBuffers<flt_t,acc_t> *buffers,
+                       const ForceConst<flt_t> &fc, const int astart, 
+		       const int aend, const int pad_width)
+{
+  typedef typename SIMD_type<flt_t>::SIMD_vec SIMD_flt_t;
+  typedef typename SIMD_type<acc_t>::SIMD_vec SIMD_acc_t;
+  const int swidth = SIMD_type<flt_t>::width();
+
+  const int inum = aend - astart;
+  if (inum == 0) return;
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  const int * _noalias const numneighhalf = buffers->get_atombin();
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+
+  const FC_PACKED0_T * _noalias const p2 = fc.p2[0];
+  const FC_PACKED1_T * _noalias const p2f = fc.p2f[0];
+  const FC_PACKED1p2_T * _noalias const p2f2 = fc.p2f2[0];
+  const FC_PACKED2_T * _noalias const p2e = fc.p2e[0];
+  const FC_PACKED3_T * _noalias const p3 = fc.p3[0][0];
+
+  flt_t * _noalias const ccachex = buffers->get_ccachex();
+  flt_t * _noalias const ccachey = buffers->get_ccachey();
+  flt_t * _noalias const ccachez = buffers->get_ccachez();
+  flt_t * _noalias const ccachew = buffers->get_ccachew();
+  int * _noalias const ccachei = buffers->get_ccachei();
+  int * _noalias const ccachej = buffers->get_ccachej();
+  acc_t * _noalias const ccachef = buffers->get_ccachef();
+  const int ccache_stride = _ccache_stride;
+  const int ccache_stride3 = _ccache_stride3;
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, /* NEWTON_PAIR*/ 1, EVFLAG, EFLAG, vflag,
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  double *timer_compute = fix->off_watch_pair();
+  int *overflow = fix->get_off_overflow_flag();
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+  #pragma offload target(mic:_cop) if(offload) \
+    in(p2,p2f,p2f2,p2e,p3:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(numneighhalf:length(0) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(ccachex,ccachey,ccachez,ccachew:length(0) alloc_if(0) free_if(0)) \
+    in(ccachei,ccachej:length(0) alloc_if(0) free_if(0)) \
+    in(ccache_stride,nthreads,inum,nall,ntypes,vflag,eatom,offload) \
+    in(astart,nlocal,f_stride,minlocal,separate_flag,pad_width) \
+    in(ccache_stride3)						\
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(f_start)
+  #endif
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(1, separate_flag, nlocal, nall,
+                              f_stride, x, 0);
+
+    acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EVFLAG) {
+      oevdwl = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(f_start,f_stride,nlocal,nall,minlocal) \
+      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id_vec(iifrom, iito, tid, inum, nthreads, swidth);
+      iifrom += astart;
+      iito += astart;
+
+      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
+      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+
+      const int toffs = tid * ccache_stride;
+      flt_t * _noalias const tdelx = ccachex + toffs;
+      flt_t * _noalias const tdely = ccachey + toffs;
+      flt_t * _noalias const tdelz = ccachez + toffs;
+      flt_t * _noalias const trsq = ccachew + toffs;
+      int * _noalias const tj = ccachei + toffs;
+      int * _noalias const tjtype = ccachej + toffs;
+      acc_t * _noalias const tf = ccachef + tid * ccache_stride3;
+
+      // loop over full neighbor list of my atoms
+
+      SIMD_flt_t cutsq, cut, powerp, powerq, sigma, c1, c2, c3,c4, c5, c6;
+      SIMD_flt_t sigma_gamma, costheta, lambda_epsilon, lambda_epsilon2;
+      if (ONETYPE) {
+	cutsq = SIMD_set(p2[3].cutsq);
+	cut = SIMD_set(p2f[3].cut);
+	sigma = SIMD_set(p2f[3].sigma);
+	c1 = SIMD_set(p2f2[3].c1);
+	c2 = SIMD_set(p2f2[3].c2);
+	c3 = SIMD_set(p2f2[3].c3);
+	c4 = SIMD_set(p2f2[3].c4);
+	sigma_gamma = SIMD_set(p2[3].sigma_gamma);
+	costheta = SIMD_set(p3[7].costheta);
+	lambda_epsilon = SIMD_set(p3[7].lambda_epsilon);
+	lambda_epsilon2 = SIMD_set(p3[7].lambda_epsilon2);
+	if (SPQ == 0) {
+	  powerp = SIMD_set(p2f[3].powerp);
+	  powerq = SIMD_set(p2f[3].powerq);
+	}
+	if (EFLAG) {
+          c5 = SIMD_set(p2e[3].c5);
+          c6 = SIMD_set(p2e[3].c6);
+        }
+      }
+
+      SIMD_int ilist = SIMD_set(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
+      const SIMD_int goffset = SIMD_set(0,16,32,48,64,80,96,112,128,
+					144,160,176,192,208,224,240);
+      ilist = ilist + iifrom;
+      for (int i = iifrom; i < iito; i += swidth) {
+	SIMD_mask imask = ilist < iito;
+	SIMD_flt_t xtmp, ytmp, ztmp;
+	SIMD_int itype, itype_offset;
+
+	if (ONETYPE)
+	  SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp);
+	else {
+	  SIMD_atom_gather(imask, &(x[i].x), goffset, xtmp, ytmp, ztmp, itype);
+	  itype_offset = itype * ntypes;
+	}
+
+	#ifdef OUTER_CHUNK
+	const int* ng = firstneigh + cnumneigh[i] - swidth;
+	#else
+        SIMD_int ng = SIMD_load(cnumneigh + i);
+	ng = ng - 1;
+	#endif
+	const SIMD_int jnum = SIMD_loadz(imask, numneigh + i);
+	const SIMD_int jnumhalf = SIMD_loadz(imask, numneighhalf + i);
+	const int jnum_max = SIMD_max(jnum);
+
+	SIMD_acc_t fxtmp = SIMD_set((acc_t)0);
+	SIMD_acc_t fytmp = SIMD_set((acc_t)0);
+	SIMD_acc_t fztmp = SIMD_set((acc_t)0);
+	SIMD_acc_t fwtmp, fxtmp2, fytmp2, fztmp2, fwtmp2;
+	if (is_same<flt_t,acc_t>::value == 0) {
+	  fxtmp2 = SIMD_set((acc_t)0);
+	  fytmp2 = SIMD_set((acc_t)0);
+	  fztmp2 = SIMD_set((acc_t)0);
+          if (EFLAG) fwtmp2 = SIMD_set((acc_t)0);
+	}
+
+        SIMD_acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
+        if (EVFLAG) {
+          if (EFLAG) {
+            fwtmp = SIMD_set((acc_t)0);
+	    sevdwl = SIMD_set((acc_t)0);
+          }
+          if (vflag==1) {
+            sv0 = SIMD_set((acc_t)0);
+	    sv1 = SIMD_set((acc_t)0);
+	    sv2 = SIMD_set((acc_t)0);
+	    sv3 = SIMD_set((acc_t)0);
+	    sv4 = SIMD_set((acc_t)0);
+	    sv5 = SIMD_set((acc_t)0);
+          }
+        }
+
+	SIMD_int ejnum = SIMD_set(0);
+	SIMD_int ejnumhalf = SIMD_set(0);
+	SIMD_int coffset = SIMD_set(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
+				    11, 12, 13, 14, 15);
+        for (int jj = 0; jj < jnum_max; jj++) {
+          SIMD_mask jmask = jj < jnum;
+
+	  #ifdef OUTER_CHUNK
+	  ng += swidth;
+	  SIMD_int j = SIMD_load(ng);
+	  #else
+	  ng = ng + 1;
+	  SIMD_int j = SIMD_gather(jmask, firstneigh, ng);
+	  #endif
+          j = j & SIMD_set(NEIGHMASK);
+	  const SIMD_int joffset = j << 4;
+
+	  SIMD_flt_t delx, dely, delz;
+	  SIMD_int jtype, ijtype;
+	  if (ONETYPE)
+	    SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz);
+	  else {
+	    SIMD_atom_gather(jmask, &(x[0].x), joffset, delx, dely, delz, 
+			     jtype);
+	    ijtype = (jtype + itype_offset) << 2;
+	    cutsq = SIMD_gather(jmask, &(p2[0].cutsq), ijtype);
+	  }
+
+	  delx = delx - xtmp;
+	  dely = dely - ytmp;
+	  delz = delz - ztmp;
+          SIMD_flt_t rsq1 = delx * delx;
+	  rsq1 = SIMD_fma(dely, dely, rsq1);
+	  rsq1 = SIMD_fma(delz, delz, rsq1);
+
+	  const SIMD_mask rmask = SIMD_lt(jmask, rsq1, cutsq);
+	  SIMD_scatter(rmask, tdelx, coffset, delx);
+	  SIMD_scatter(rmask, tdely, coffset, dely);
+	  SIMD_scatter(rmask, tdelz, coffset, delz);
+	  SIMD_scatter(rmask, trsq, coffset, rsq1);
+	  SIMD_scatter(rmask, tj, coffset, j);
+	  if (!ONETYPE) SIMD_scatter(rmask, tjtype, coffset, jtype);
+	  ejnum = SIMD_add(rmask, ejnum, 1);
+	  coffset = SIMD_add(rmask, coffset, swidth);
+	  const SIMD_mask hmask = SIMD_lt(rmask, SIMD_set(jj), jnumhalf);
+	  ejnumhalf = SIMD_add(hmask, ejnumhalf, 1);
+	}
+
+	const int ejnum_max = SIMD_max(ejnum);
+	const int ejnumhalf_max = SIMD_max(ejnumhalf);
+	memset(tf, 0, ejnum_max * sizeof(double) * swidth * 3);
+        for (int jj = 0; jj < ejnum_max; jj++) {
+          SIMD_int ijtype;
+	  const int coffset = jj * swidth;
+	  if (!ONETYPE) {
+	    ijtype = SIMD_load(tjtype + coffset);
+	    ijtype = (ijtype + itype_offset) << 2;
+	    cut = SIMD_gather(&(p2f[0].cut), ijtype);
+	  }
+
+	  SIMD_acc_t fjxtmp = SIMD_set((acc_t)0);
+	  SIMD_acc_t fjytmp = SIMD_set((acc_t)0);
+	  SIMD_acc_t fjztmp = SIMD_set((acc_t)0);
+	  SIMD_acc_t fjtmp, fjxtmp2, fjytmp2, fjztmp2, fjtmp2;
+          if (EFLAG) fjtmp = SIMD_set((acc_t)0.0);
+
+	  if (is_same<flt_t,acc_t>::value == 0) {
+	    fjxtmp2 = SIMD_set((acc_t)0);
+	    fjytmp2 = SIMD_set((acc_t)0);
+	    fjztmp2 = SIMD_set((acc_t)0);
+	    if (EFLAG) fjtmp2 = SIMD_set((acc_t)0.0);
+	  }
+
+          const SIMD_flt_t delx = SIMD_load(tdelx + coffset);
+          const SIMD_flt_t dely = SIMD_load(tdely + coffset);
+          const SIMD_flt_t delz = SIMD_load(tdelz + coffset);
+          const SIMD_flt_t rsq1 = SIMD_load(trsq + coffset);
+
+          const SIMD_flt_t r1 = SIMD_sqrt(rsq1); 
+          const SIMD_flt_t rinvsq1 = SIMD_rcp(rsq1);
+          const SIMD_flt_t rainv1 = SIMD_rcp(r1 - cut);
+	  
+	  // two-body interactions, skip half of them
+	  if (jj < ejnumhalf_max) {
+            SIMD_flt_t rp, rq;
+	    if (SPQ == 1) {
+              rp = r1 * r1;
+	      rp = rp * rp;
+	      rp = SIMD_rcp(rp);
+	      rq = SIMD_set((flt_t)1.0);
+            } else {
+	      if (!ONETYPE) {
+		powerp = SIMD_gather(&(p2f[0].powerp), ijtype);
+		powerq = SIMD_gather(&(p2f[0].powerq), ijtype);
+	      }
+	      rp = SIMD_pow(r1, powerp);
+	      rq = SIMD_pow(r1, powerq);
+	    }
+
+	    if (!ONETYPE) {
+	      sigma = SIMD_gather(&(p2f[0].sigma), ijtype);
+	      c1 = SIMD_gather(&(p2f2[0].c1), ijtype);
+	      c2 = SIMD_gather(&(p2f2[0].c2), ijtype);
+	      c3 = SIMD_gather(&(p2f2[0].c3), ijtype);
+	      c4 = SIMD_gather(&(p2f2[0].c4), ijtype);
+	    }
+
+	    const SIMD_flt_t rainvsq = rainv1 * rainv1 * r1;
+	    const SIMD_flt_t expsrainv = SIMD_exp(sigma * rainv1);
+	    const SIMD_flt_t fpair = (c1 * rp - c2 * rq + (c3 * rp - c4 * rq) *
+				      rainvsq) * expsrainv * rinvsq1;
+
+	    const SIMD_flt_t fjx = delx * fpair;
+	    const SIMD_flt_t fjy = dely * fpair;
+	    const SIMD_flt_t fjz = delz * fpair;
+
+	    const SIMD_mask hmask = jj < ejnumhalf;
+	    SIMD_accumulate3(hmask, fjx, fjy, fjz, fxtmp, fytmp, fztmp,
+			     fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
+			     fztmp2, fjxtmp2, fjytmp2, fjztmp2); 
+          
+	    if (EVFLAG) {
+	      if (EFLAG) {
+	        if (!ONETYPE) {
+	          c5 = SIMD_gather(&(p2e[0].c5), ijtype);
+	          c6 = SIMD_gather(&(p2e[0].c6), ijtype);
+                }            
+	        SIMD_flt_t evdwl;
+		evdwl = (c5 * rp - c6 * rq) * expsrainv;
+		SIMD_acc_energy3(hmask, evdwl, eatom, sevdwl, fwtmp, fjtmp,
+                                 fwtmp2, fjtmp2);
+	      }
+	      SIMD_ev_tally_nbor(hmask, vflag, (flt_t)1.0, fpair, delx, dely, 
+                                 delz, sv0, sv1, sv2, sv3, sv4, sv5);
+	    }
+          }
+
+	  /*---------------------------------------------*/
+	  SIMD_int ijkoff;
+	  if (!ONETYPE) {
+	    sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), ijtype);
+	    ijkoff = ijtype * ntypes;
+	  }
+          const SIMD_flt_t gsrainv1 = sigma_gamma * rainv1;
+          const SIMD_flt_t gsrainvsq1 = gsrainv1 * rainv1 / r1;
+          const SIMD_flt_t expgsrainv1 = SIMD_exp(gsrainv1);
+
+	  const SIMD_mask jmask = jj < ejnum;
+          for (int kk = jj+1; kk < ejnum_max; kk++) {
+	    SIMD_int iktype, ijktype;
+	    const int kcoffset = kk * swidth;
+	    if (!ONETYPE) {
+	      iktype = SIMD_load(tjtype + kcoffset);
+	      ijktype = ijkoff + (iktype << 2);
+	      iktype = (iktype + itype_offset) << 2;
+	      cut = SIMD_gather(&(p2[0].cut), iktype);
+	      sigma_gamma = SIMD_gather(&(p2[0].sigma_gamma), iktype);
+	      costheta = SIMD_gather(&(p3[0].costheta), ijktype);
+	      lambda_epsilon = SIMD_gather(&(p3[0].lambda_epsilon), ijktype);
+	      lambda_epsilon2 = SIMD_gather(&(p3[0].lambda_epsilon2), ijktype);
+	    }
+	    const SIMD_flt_t delr2x = SIMD_load(tdelx + kcoffset);
+	    const SIMD_flt_t delr2y = SIMD_load(tdely + kcoffset);
+	    const SIMD_flt_t delr2z = SIMD_load(tdelz + kcoffset);
+	    const SIMD_flt_t rsq2 = SIMD_load(trsq + kcoffset);
+
+	    const SIMD_flt_t r2 = SIMD_sqrt(rsq2);
+	    const SIMD_flt_t rinvsq2 = SIMD_rcp(rsq2);
+	    const SIMD_flt_t rainv2 = SIMD_rcp(r2 - cut);
+	    const SIMD_flt_t gsrainv2 = sigma_gamma * rainv2;
+	    const SIMD_flt_t gsrainvsq2 = gsrainv2 * rainv2 / r2;
+	    const SIMD_flt_t expgsrainv2 = SIMD_exp(gsrainv2);
+	    const SIMD_flt_t rinv12 = SIMD_rcp(r1 * r2);
+	    const SIMD_flt_t cs = (delx * delr2x + dely * delr2y + 
+                              delz * delr2z) * rinv12;
+	    const SIMD_flt_t delcs = cs - costheta;
+	    const SIMD_flt_t delcssq = delcs*delcs;
+
+	    const SIMD_flt_t facexp = expgsrainv1*expgsrainv2;
+	    const SIMD_flt_t facrad = lambda_epsilon * facexp * delcssq;
+	    const SIMD_flt_t frad1 = facrad * gsrainvsq1;
+	    const SIMD_flt_t frad2 = facrad * gsrainvsq2;
+	    const SIMD_flt_t facang = lambda_epsilon2 * facexp * delcs;
+	    const SIMD_flt_t facang12 = rinv12 * facang;
+	    const SIMD_flt_t csfacang = cs * facang;
+
+	    const SIMD_flt_t csfac1 = rinvsq1 * csfacang;
+	    const SIMD_flt_t fjx = delx * (frad1 + csfac1)-delr2x*facang12;
+	    const SIMD_flt_t fjy = dely * (frad1 + csfac1)-delr2y*facang12;
+	    const SIMD_flt_t fjz = delz * (frad1 + csfac1)-delr2z*facang12;
+
+	    const SIMD_flt_t csfac2 = rinvsq2 * csfacang;
+	    SIMD_flt_t fkx = delx * facang12 - delr2x * (frad2 + csfac2);
+	    SIMD_flt_t fky = dely * facang12 - delr2y * (frad2 + csfac2);
+	    SIMD_flt_t fkz = delz * facang12 - delr2z * (frad2 + csfac2);
+
+	    const SIMD_mask kmask = SIMD_lt(jmask, kk, ejnum);
+
+	    SIMD_acc_cache3(kmask, fjx, fjy, fjz, fkx, fky, fkz, fxtmp, fytmp,
+			    fztmp, fjxtmp, fjytmp, fjztmp, fxtmp2, fytmp2,
+			    fztmp2, fjxtmp2, fjytmp2, fjztmp2, 
+			    tf + kcoffset * 3, swidth); 
+
+	    if (EVFLAG) {
+	      if (EFLAG)
+		SIMD_acc_three(kmask, facrad, eatom, sevdwl, fwtmp, fjtmp,
+			       fwtmp2, fjtmp2);
+	      SIMD_ev_tally_nbor3v(kmask, vflag, fjx, fjy, fjz, fkx, fky, fkz,
+				   delx, dely, delz, delr2x, delr2y, delr2z,
+				   sv0, sv1, sv2, sv3, sv4, sv5);
+	    }
+	    
+	  } // for kk
+	  if (is_same<flt_t,acc_t>::value == 1)
+	    SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp);
+	  else
+	    SIMD_cache3(tf + coffset * 3, swidth, fjxtmp, fjytmp, fjztmp, 
+	                fjxtmp2, fjytmp2, fjztmp2);
+
+	  //if (EFLAG)
+	  //  if (eatom) f[j].w += fjtmp;
+        } // for jj first loop
+
+        for (int jj = 0; jj < ejnum_max; jj++) {
+	  const int coffset = jj * swidth;
+	  const SIMD_mask jmask = jj < ejnum;
+          const SIMD_int j = SIMD_load(tj + coffset);
+	  const SIMD_int joffset = j << 4;
+
+	  SIMD_acc_t fjxtmp, fjytmp, fjztmp, fjxtmp2, fjytmp2, fjztmp2;
+	  int foffset = swidth;
+	  if (is_same<flt_t,acc_t>::value == 0) foffset = foffset >> 1;
+	  acc_t *p = tf + coffset * 3;
+	  fjxtmp = SIMD_load(p);
+	  if (is_same<flt_t,acc_t>::value == 0) {
+	    p = p + foffset;
+	    fjxtmp2 = SIMD_load(p);
+	  }
+	  p = p + foffset;
+	  fjytmp = SIMD_load(p);
+	  if (is_same<flt_t,acc_t>::value == 0) {
+	    p = p + foffset;
+	    fjytmp2 = SIMD_load(p);
+	  }
+	  p = p + foffset;
+	  fjztmp = SIMD_load(p);
+	  if (is_same<flt_t,acc_t>::value == 0) {
+	    p = p + foffset;
+	    fjztmp2 = SIMD_load(p);
+	  }
+	  
+	  SIMD_conflict_pi_reduce3(jmask, joffset, fjxtmp, fjytmp, fjztmp);
+	  SIMD_jforce_update(jmask, &(f[0].x), joffset, fjxtmp, fjytmp, 
+			     fjztmp);
+          if (is_same<flt_t,acc_t>::value == 0) {
+	    SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
+	    SIMD_mask jmask2 = jmask >> 8;
+	    SIMD_conflict_pi_reduce3(jmask2, joffset2, fjxtmp2, fjytmp2, 
+				     fjztmp2);
+	    SIMD_jforce_update(jmask2, &(f[0].x), joffset2, fjxtmp2, fjytmp2, 
+			       fjztmp2);
+	  }
+	} // for jj second loop
+
+	SIMD_iforce_update(imask, &(f[i].x), goffset, fxtmp, fytmp, fztmp);
+	if (is_same<flt_t,acc_t>::value == 0) {
+	  imask = imask >> 8;
+	  SIMD_iforce_update(imask, &(f[i+8].x), goffset, fxtmp2, fytmp2, 
+			     fztmp2);
+	}
+	if (EVFLAG) {
+	  if (EFLAG) oevdwl += SIMD_sum(sevdwl);
+	  // f[i].w
+	  if (vflag == 1) {
+	    ov0 += SIMD_sum(sv0);
+	    ov1 += SIMD_sum(sv1);
+	    ov2 += SIMD_sum(sv2);
+	    ov3 += SIMD_sum(sv3);
+	    ov4 += SIMD_sum(sv4);
+	    ov5 += SIMD_sum(sv5);
+	  }
+	}
+	ilist = ilist + swidth;
+      } // for ii
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      IP_PRE_fdotr_acc_force(1, EVFLAG,  EFLAG, vflag, eatom, nall, nlocal, 
+	                     minlocal, nthreads, f_start, f_stride, x);
+    } // end omp
+  
+    if (EVFLAG) {
+      if (EFLAG) {
+        ev_global[0] = oevdwl;
+        ev_global[1] = (acc_t)0.0;
+      }
+      if (vflag) {
+        ev_global[2] = ov0;
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload, eatom);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+#endif
+
 /* ---------------------------------------------------------------------- */
 
 void PairSWIntel::allocate()
@@ -506,7 +1147,8 @@ void PairSWIntel::init_style()
                "The 'package intel' command is required for /intel styles");
   fix = static_cast<FixIntel *>(modify->fix[ifix]);
 
-  fix->pair_init_check();
+  fix->pair_init_check(true);
+
   #ifdef _LMP_INTEL_OFFLOAD
   _cop = fix->coprocessor_number();
   #endif
@@ -540,12 +1182,26 @@ template <class flt_t, class acc_t>
 void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
                                    IntelBuffers<flt_t,acc_t> *buffers)
 {
+  #ifdef LMP_USE_AVXCD
+  fix->nbor_pack_width(SIMD_type<flt_t>::width());
+  #endif
+
   int off_ccache = 0;
   #ifdef _LMP_INTEL_OFFLOAD
   if (_cop >= 0) off_ccache = 1;
   #endif
-  buffers->grow_ccache(off_ccache, comm->nthreads);
+
+  #ifdef LMP_USE_AVXCD
+  const int swidth = SIMD_type<flt_t>::width();
+  #else
+  const int swidth = 1;
+  #endif
+
+  buffers->grow_ccache(off_ccache, comm->nthreads, swidth);
   _ccache_stride = buffers->ccache_stride();
+  #ifdef LMP_USE_AVXCD
+  _ccache_stride3 = buffers->ccache_stride3();
+  #endif
 
   int tp1 = atom->ntypes + 1;
   fc.set_ntypes(tp1,memory,_cop);
@@ -564,6 +1220,9 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
       }
     }
   }
+  
+  _onetype = 0;
+  if (atom->ntypes == 1) _onetype = 1;
 
   _spq = 1;
   for (int ii = 0; ii < tp1; ii++) {
@@ -578,10 +1237,10 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
 	fc.p2f[ii][jj].powerp = 0;
 	fc.p2f[ii][jj].powerq = 0;
 	fc.p2f[ii][jj].sigma = 0;
-	fc.p2f[ii][jj].c1 = 0;
-	fc.p2f[ii][jj].c2 = 0;
-	fc.p2f[ii][jj].c3 = 0;
-	fc.p2f[ii][jj].c4 = 0;
+	fc.p2f2[ii][jj].c1 = 0;
+	fc.p2f2[ii][jj].c2 = 0;
+	fc.p2f2[ii][jj].c3 = 0;
+	fc.p2f2[ii][jj].c4 = 0;
 	fc.p2e[ii][jj].c5 = 0;
 	fc.p2e[ii][jj].c6 = 0;
       } else {
@@ -590,13 +1249,13 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
 	fc.p2[ii][jj].cut = params[ijparam].cut;
 	fc.p2[ii][jj].sigma_gamma = params[ijparam].sigma_gamma;
 	fc.p2f[ii][jj].cut = params[ijparam].cut;
-	fc.p2f[ii][jj].powerp = params[ijparam].powerp;
-	fc.p2f[ii][jj].powerq = params[ijparam].powerq;
+	fc.p2f[ii][jj].powerp = -params[ijparam].powerp;
+	fc.p2f[ii][jj].powerq = -params[ijparam].powerq;
 	fc.p2f[ii][jj].sigma = params[ijparam].sigma;
-	fc.p2f[ii][jj].c1 = params[ijparam].c1;
-	fc.p2f[ii][jj].c2 = params[ijparam].c2;
-	fc.p2f[ii][jj].c3 = params[ijparam].c3;
-	fc.p2f[ii][jj].c4 = params[ijparam].c4;
+	fc.p2f2[ii][jj].c1 = params[ijparam].c1;
+	fc.p2f2[ii][jj].c2 = params[ijparam].c2;
+	fc.p2f2[ii][jj].c3 = params[ijparam].c3;
+	fc.p2f2[ii][jj].c4 = params[ijparam].c4;
 	fc.p2e[ii][jj].c5 = params[ijparam].c5;
 	fc.p2e[ii][jj].c6 = params[ijparam].c6;
 
@@ -634,15 +1293,16 @@ void PairSWIntel::pack_force_const(ForceConst<flt_t> &fc,
   if (_cop < 0) return;
   FC_PACKED0_T *op2 = fc.p2[0];
   FC_PACKED1_T *op2f = fc.p2f[0];
+  FC_PACKED1p2_T *op2f2 = fc.p2f2[0];
   FC_PACKED2_T *op2e = fc.p2e[0];
   FC_PACKED3_T *op3 = fc.p3[0][0];
   flt_t * ocutneighsq = cutneighsq[0];
   int tp1sq = tp1 * tp1;
   int tp1cu = tp1sq * tp1;
-  if (op2 != NULL && op2f != NULL && op2e != NULL && op3 != NULL &&
-      ocutneighsq != NULL) {
+  if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && 
+      op3 != NULL && ocutneighsq != NULL) {
     #pragma offload_transfer target(mic:_cop) \
-      in(op2,op2f,op2e: length(tp1sq) alloc_if(0) free_if(0)) \
+      in(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(0) free_if(0))	\
       in(op3: length(tp1cu) alloc_if(0) free_if(0)) \
       in(ocutneighsq: length(tp1sq))
   }
@@ -659,19 +1319,21 @@ void PairSWIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
     if (_ntypes > 0) {
       fc_packed0 *op2 = p2[0];
       fc_packed1 *op2f = p2f[0];
+      fc_packed1p2 *op2f2 = p2f2[0];
       fc_packed2 *op2e = p2e[0];
       fc_packed3 *op3 = p3[0][0];
 
       #ifdef _LMP_INTEL_OFFLOAD
-      if (op2 != NULL && op2f != NULL && op2e != NULL && op3 != NULL &&
-          _cop >= 0) {
+      if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && 
+	  op3 != NULL && _cop >= 0) {
         #pragma offload_transfer target(mic:_cop) \
-          nocopy(op2, op2f, op2e, op3: alloc_if(0) free_if(1))
+          nocopy(op2, op2f, op2f2, op2e, op3: alloc_if(0) free_if(1))
       }
       #endif
 
       _memory->destroy(op2);
       _memory->destroy(op2f);
+      _memory->destroy(op2f2);
       _memory->destroy(op2e);
       _memory->destroy(op3);
     }
@@ -679,20 +1341,22 @@ void PairSWIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
       _cop = cop;
       memory->create(p2,ntypes,ntypes,"fc.p2");
       memory->create(p2f,ntypes,ntypes,"fc.p2f");
+      memory->create(p2f2,ntypes,ntypes,"fc.p2f2");
       memory->create(p2e,ntypes,ntypes,"fc.p2e");
       memory->create(p3,ntypes,ntypes,ntypes,"fc.p3");
 
       #ifdef _LMP_INTEL_OFFLOAD
       fc_packed0 *op2 = p2[0];
       fc_packed1 *op2f = p2f[0];
+      fc_packed1p2 *op2f2 = p2f2[0];
       fc_packed2 *op2e = p2e[0];
       fc_packed3 *op3 = p3[0][0];
       int tp1sq = ntypes * ntypes;
       int tp1cu = tp1sq * ntypes;
-      if (op2 != NULL && op2f != NULL && op2e != NULL && op3 != NULL &&
-          cop >= 0) {
+      if (op2 != NULL && op2f != NULL && op2f2 != NULL && op2e != NULL && 
+	  op3 != NULL && cop >= 0) {
         #pragma offload_transfer target(mic:cop) \
-          nocopy(op2,op2f,op2e: length(tp1sq) alloc_if(1) free_if(0)) \
+          nocopy(op2,op2f,op2f2,op2e: length(tp1sq) alloc_if(1) free_if(0)) \
           nocopy(op3: length(tp1cu) alloc_if(1) free_if(0))
       }
       #endif
diff --git a/src/USER-INTEL/pair_sw_intel.h b/src/USER-INTEL/pair_sw_intel.h
index a38dd75cf3..8723803a35 100755
--- a/src/USER-INTEL/pair_sw_intel.h
+++ b/src/USER-INTEL/pair_sw_intel.h
@@ -11,6 +11,10 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
 #ifdef PAIR_CLASS
 
 PairStyle(sw/intel,PairSWIntel)
@@ -42,7 +46,7 @@ class PairSWIntel : public PairSW {
   template <class flt_t, class acc_t>
   void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
                const ForceConst<flt_t> &fc);
-  template <int SPQ, int EVFLAG, int EFLAG, class flt_t, class acc_t>
+  template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
   void eval(const int offload, const int vflag,
             IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
 	    const int astart, const int aend, const int pad_width);
@@ -51,7 +55,10 @@ class PairSWIntel : public PairSW {
   void pack_force_const(ForceConst<flt_t> &fc,
                         IntelBuffers<flt_t, acc_t> *buffers);
 
-  int _ccache_stride, _host_pad, _offload_pad, _spq;
+  int _ccache_stride, _host_pad, _offload_pad, _spq, _onetype;
+  #ifdef LMP_USE_AVXCD
+  int _ccache_stride3;
+  #endif
 
   // ----------------------------------------------------------------------
 
@@ -62,8 +69,11 @@ class PairSWIntel : public PairSW {
       flt_t cutsq, cut, sigma_gamma, pad;
     } fc_packed0;
     typedef struct {
-      flt_t powerp, powerq, cut, sigma, c1, c2, c3, c4;
+      flt_t powerp, powerq, cut, sigma;
     } fc_packed1;
+    typedef struct {
+      flt_t c1, c2, c3, c4;
+    } fc_packed1p2;
     typedef struct {
       flt_t c5, c6;
     } fc_packed2;
@@ -73,6 +83,7 @@ class PairSWIntel : public PairSW {
 
     fc_packed0 **p2;
     fc_packed1 **p2f;
+    fc_packed1p2 **p2f2;
     fc_packed2 **p2e;
     fc_packed3 ***p3;