git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@14386 f3b2605a-c512-4ea7-a41b-209d697bcdaa

2015-12-19 18:30:53 +00:00
parent fb2cb218c8
commit 72ebb1dd9b
17 changed files with 1499 additions and 402 deletions
--- a/src/USER-INTEL/Install.sh
+++ b/src/USER-INTEL/Install.sh
@ -26,14 +26,12 @@ action () {
 # do not install child files if parent does not exist

 for file in *_intel.cpp; do
-  test $file = thr_intel.cpp && continue
  dep=`echo $file | sed 's/neigh_full_intel/neigh_full/g' | \
      sed 's/_offload_intel//g' | sed 's/_intel//g'`
  action $file $dep
 done

 for file in *_intel.h; do
-  test $file = thr_intel.h && continue
  dep=`echo $file | sed 's/_offload_intel//g' | sed 's/_intel//g'`
  action $file $dep
 done
@ -42,6 +40,8 @@ action intel_preprocess.h
 action intel_buffers.h
 action intel_buffers.cpp
 action math_extra_intel.h
+action intel_simd.h pair_sw_intel.cpp
+action intel_intrinsics.h pair_tersoff_intel.cpp

 # step 2: handle cases and tasks not handled in step 1.

--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@ -3,9 +3,11 @@
                          LAMMPS Intel(R) Package
                     --------------------------------
                     
-                        W. Michael Brown (Intel)
-                       michael.w.brown at intel.com
-
+             W. Michael Brown (Intel) michael.w.brown at intel.com
+                   Rodrigo Canales (RWTH Aachen University)
+                  Markus H<>hnerbach (RWTH Aachen University)
+		   Ahmed E. Ismail (RWTH Aachen University)
+                   Paolo Bientinesi (RWTH Aachen University)
                          Anupama Kurpad (Intel)
                          Biswajit Mishra (Shell)

@ -53,3 +55,12 @@ By default, when running with offload to Intel(R) coprocessors, affinity
 for host MPI tasks and OpenMP threads is set automatically within the code.
 This currently requires the use of system calls. To disable at build time,
 compile with -DINTEL_OFFLOAD_NOAFFINITY.
+
+-----------------------------------------------------------------------------
+
+Vector intrinsics are temporarily being used for the Stillinger-Weber 
+potential to allow for advanced features in the AVX512 instruction set to
+be exploited on early hardware. We hope to see compiler improvements for
+AVX512 that will eliminate this requirement, so it is not recommended to
+develop code based on the intrinsics implementation. Please e-mail the 
+authors for more details.
--- a/src/USER-INTEL/TEST/in.intel.lc
+++ b/src/USER-INTEL/TEST/in.intel.lc
@ -7,7 +7,7 @@
 package intel 1 mode mixed balance $b
 package omp 0
 suffix $s
-processors * * * grid numa
+# processors * * * grid numa

 variable        x index 4
 variable        y index 2
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@ -60,6 +60,8 @@ FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)

  int ncops = force->inumeric(FLERR,arg[3]);

+  _nbor_pack_width = 1;
+
  _precision_mode = PREC_MODE_MIXED;
  _offload_balance = 1.0;
  _overflow_flag[LMP_OVERFLOW] = 0;
@ -307,12 +309,14 @@ void FixIntel::setup(int vflag)

 /* ---------------------------------------------------------------------- */

-void FixIntel::pair_init_check()
+void FixIntel::pair_init_check(const bool cdmessage)
 {
  #ifdef INTEL_VMASK
  atom->sortfreq = 1;
  #endif

+  _nbor_pack_width = 1;
+
  #ifdef _LMP_INTEL_OFFLOAD
  if (_offload_balance != 0.0) atom->sortfreq = 1;

@ -371,15 +375,12 @@ void FixIntel::pair_init_check()
  char kmode[80];
  if (_precision_mode == PREC_MODE_SINGLE) {
    strcpy(kmode, "single");
-    get_single_buffers()->free_all_nbor_buffers();
    get_single_buffers()->need_tag(need_tag);
  } else if (_precision_mode == PREC_MODE_MIXED) {
    strcpy(kmode, "mixed");
-    get_mixed_buffers()->free_all_nbor_buffers();
    get_mixed_buffers()->need_tag(need_tag);
  } else {
    strcpy(kmode, "double");
-    get_double_buffers()->free_all_nbor_buffers();
    get_double_buffers()->need_tag(need_tag);
  }

@ -399,6 +400,13 @@ void FixIntel::pair_init_check()
 	fprintf(screen,"Using Intel Package without Coprocessor.\n");
      }
      fprintf(screen,"Precision: %s\n",kmode);
+      if (cdmessage) {
+	#ifdef LMP_USE_AVXCD
+	fprintf(screen,"AVX512 CD Optimizations: Enabled\n");
+	#else
+	fprintf(screen,"AVX512 CD Optimizations: Disabled\n");
+	#endif
+      }
      fprintf(screen,
 	      "----------------------------------------------------------\n");
    }
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@ -11,6 +11,10 @@
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
 #ifdef FIX_CLASS

 FixStyle(INTEL,FixIntel)
@ -39,7 +43,7 @@ class FixIntel : public Fix {
  virtual int setmask();
  virtual void init();
  virtual void setup(int);
-  void pair_init_check();
+  void pair_init_check(const bool cdmessage=false);

  // Get all forces, calculation results from coprocesser
  void sync_coprocessor();
@ -58,12 +62,15 @@ class FixIntel : public Fix {
  inline IntelBuffers<double,double> * get_double_buffers()
    { return _double_buffers; }

+  inline int nbor_pack_width() const { return _nbor_pack_width; }
+  inline void nbor_pack_width(const int w) { _nbor_pack_width = w; }
+
 protected:
  IntelBuffers<float,float> *_single_buffers;
  IntelBuffers<float,double> *_mixed_buffers;
  IntelBuffers<double,double> *_double_buffers;

-  int _precision_mode, _nthreads;
+  int _precision_mode, _nthreads, _nbor_pack_width;

 public:
  inline int* get_overflow_flag() { return _overflow_flag; }
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@ -343,11 +343,15 @@ void IntelBuffers<flt_t, acc_t>::free_nbor_list()
 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list,
                                                 const int nlocal,
-                                                 const int offload_end)
+	                                         const int nthreads,
+	                                         const int offload_end,
+	                                         const int pack_width)
 {
  free_nbor_list();
  _list_alloc_atoms = 1.10 * nlocal;
-  int list_alloc_size = (_list_alloc_atoms + _off_threads) * get_max_nbors();
+  int nt = MAX(nthreads, _off_threads);
+  int list_alloc_size = (_list_alloc_atoms + nt + pack_width - 1) * 
+    get_max_nbors();
  lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
  #ifdef _LMP_INTEL_OFFLOAD
  if (offload_end > 0) {
@ -393,6 +397,9 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
    flt_t *ccachew = _ccachew;
    int *ccachei = _ccachei;
    int *ccachej = _ccachej;
+    #ifdef LMP_USE_AVXCD
+    acc_t *ccachef = _ccachef;
+    #endif

    #ifdef _LMP_INTEL_OFFLOAD
    if (_off_ccache) {
@ -409,6 +416,9 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()
    lmp->memory->destroy(ccachew);
    lmp->memory->destroy(ccachei);
    lmp->memory->destroy(ccachej);
+    #ifdef LMP_USE_AVXCD
+    lmp->memory->destroy(ccachef);
+    #endif

    _ccachex = 0;
  }
@ -418,7 +428,8 @@ void IntelBuffers<flt_t, acc_t>::free_ccache()

 template <class flt_t, class acc_t>
 void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
-                                           const int nthreads)
+	const int nthreads,
+	const int width)
 {
  #ifdef _LMP_INTEL_OFFLOAD
  if (_ccachex && off_flag && _off_ccache == 0)
@ -427,7 +438,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
  if (_ccachex)
    return;

-  const int nsize = get_max_nbors();
+  const int nsize = get_max_nbors() * width;
  int esize = MIN(sizeof(int), sizeof(flt_t));
  IP_PRE_get_stride(_ccache_stride, nsize, esize, 0);
  int nt = MAX(nthreads, _off_threads);
@ -439,6 +450,11 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
  lmp->memory->create(_ccachew, vsize, "_ccachew");
  lmp->memory->create(_ccachei, vsize, "_ccachei");
  lmp->memory->create(_ccachej, vsize, "_ccachej");
+  #ifdef LMP_USE_AVXCD
+  IP_PRE_get_stride(_ccache_stride3, nsize * 3, esize, 0);
+  lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
+  #endif
+  memset(_ccachej, 0, vsize * sizeof(int));

  #ifdef _LMP_INTEL_OFFLOAD
  if (off_flag) {
@ -454,7 +470,8 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
      #pragma offload_transfer target(mic:_cop) \
        nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
        nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
-        nocopy(ccachei,ccachej:length(vsize) alloc_if(1) free_if(0))
+        nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
+        in(ccachej:length(vsize) alloc_if(1) free_if(0))
    }
    _off_ccache = 1;
  }
--- a/src/USER-INTEL/intel_buffers.h
+++ b/src/USER-INTEL/intel_buffers.h
@ -75,14 +75,14 @@ class IntelBuffers {
    free_local();
  }

-  inline void grow_nbor(NeighList *list, const int nlocal,
-                        const int offload_end) {
+  inline void grow_nbor(NeighList *list, const int nlocal, const int nthreads,
+                        const int offload_end, const int pack_width=1) {
    grow_local(list, offload_end);
    if (offload_end) {
      grow_nmax();
      grow_binhead();
    }
-    grow_nbor_list(list, nlocal, offload_end);
+    grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
  }

  void free_nmax();
@ -111,7 +111,7 @@ class IntelBuffers {
  }

  void free_ccache();
-  void grow_ccache(const int off_flag, const int nthreads);
+  void grow_ccache(const int off_flag, const int nthreads, const int width=1);
  inline int ccache_stride() { return _ccache_stride; }
  inline flt_t * get_ccachex() { return _ccachex; }
  inline flt_t * get_ccachey() { return _ccachey; }
@ -119,6 +119,10 @@ class IntelBuffers {
  inline flt_t * get_ccachew() { return _ccachew; }
  inline int * get_ccachei() { return _ccachei; }
  inline int * get_ccachej() { return _ccachej; }
+  #ifdef LMP_USE_AVXCD
+  inline int ccache_stride3() { return _ccache_stride3; }
+  inline acc_t * get_ccachef() { return _ccachef; }
+  #endif

  inline int get_max_nbors() {
    int mn = lmp->neighbor->oneatom * sizeof(int) /
@ -129,9 +133,10 @@ class IntelBuffers {
  void free_nbor_list();

  inline void grow_nbor_list(NeighList *list, const int nlocal,
-                             const int offload_end) {
+                             const int nthreads, const int offload_end,
+			     const int pack_width) {
    if (nlocal > _list_alloc_atoms)
-      _grow_nbor_list(list, nlocal, offload_end);
+      _grow_nbor_list(list, nlocal, nthreads, offload_end, pack_width);
    #ifdef _LMP_INTEL_OFFLOAD
    else if (offload_end > 0 && _off_map_stencil != list->stencil)
      _grow_stencil(list);
@ -281,6 +286,10 @@ class IntelBuffers {
  int _ccache_stride;
  flt_t *_ccachex, *_ccachey, *_ccachez, *_ccachew;
  int *_ccachei, *_ccachej;
+  #ifdef LMP_USE_AVXCD
+  int _ccache_stride3;
+  acc_t * _ccachef;
+  #endif

  #ifdef _LMP_INTEL_OFFLOAD
  int _separate_buffers;
@ -305,8 +314,8 @@ class IntelBuffers {
  void _grow_nmax();
  void _grow_local(NeighList *list, const int offload_end);
  void _grow_binhead();
-  void _grow_nbor_list(NeighList *list, const int nlocal,
-                       const int offload_end);
+  void _grow_nbor_list(NeighList *list, const int nlocal, const int nthreads,
+                       const int offload_end, const int pack_width);
  void _grow_stencil(NeighList *list);
 };

--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@ -55,30 +55,37 @@ enum {LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX, LMP_GHOST_MIN,
 enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
      TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
      TIME_IMBALANCE};
-#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )

+#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
 #define INTEL_MIC_VECTOR_WIDTH 16
 #define INTEL_VECTOR_WIDTH 4
+
 #ifdef __AVX__
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 8
 #endif
+
 #ifdef __AVX2__
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 8
 #endif
+
 #ifdef __AVX512F__
 #undef INTEL_VECTOR_WIDTH
 #define INTEL_VECTOR_WIDTH 16
 #define INTEL_V512 1
 #define INTEL_VMASK 1
 #else
-
 #ifdef __MIC__
 #define INTEL_V512 1
 #define INTEL_VMASK 1
 #endif
+#endif

+#ifdef __AVX512CD__
+#ifndef _LMP_INTEL_OFFLOAD
+#define LMP_USE_AVXCD
+#endif
 #endif

 #define INTEL_DATA_ALIGN 64
@ -134,6 +141,18 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
 			   datasize);				\
  }

+#define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum,          \
+				nthreads, vecsize)		\
+  {								\
+    tid = omp_get_thread_num();         			\
+    int idelta = static_cast<int>(ceil(static_cast<float>(inum)	\
+				       /vecsize/nthreads));	\
+    idelta *= vecsize;						\
+    ifrom = tid*idelta;						\
+    ito = ifrom + idelta;					\
+    if (ito > inum) ito = inum;					\
+  }
+
 #else

 #define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)	\
@ -364,6 +383,43 @@ inline double MIC_Wtime() {
  }                                                                     \
 }

+#define IP_PRE_ev_tally_dihed(eflag, eatom, vflag, deng, i1, i2, i3, i4,\
+			      f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y,	\
+			      f4z, vb1x, vb1y, vb1z, vb2x, vb2y, vb2z,	\
+			      vb3x, vb3y, vb3z,oedihedral, force,	\
+			      newton, nlocal)				\
+{                                                                       \
+  flt_t ev_pre;								\
+  if (newton) ev_pre = (flt_t)1.0;					\
+  else {								\
+    ev_pre = (flt_t)0.0;						\
+    if (i1 < nlocal) ev_pre += (flt_t)0.25;				\
+    if (i2 < nlocal) ev_pre += (flt_t)0.25;				\
+    if (i3 < nlocal) ev_pre += (flt_t)0.25;				\
+    if (i4 < nlocal) ev_pre += (flt_t)0.25;				\
+  }									\
+									\
+  if (eflag) {								\
+    oedihedral += ev_pre * deng;					\
+    if (eatom) {							\
+      flt_t qdeng = deng * (flt_t)0.25;					\
+      if (newton || i1 < nlocal) f[i1].w += qdeng;			\
+      if (newton || i2 < nlocal) f[i2].w += qdeng;			\
+      if (newton || i3 < nlocal) f[i3].w += qdeng;			\
+      if (newton || i4 < nlocal) f[i4].w += qdeng;			\
+    }									\
+  }									\
+									\
+  if (vflag) {								\
+    sv0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x);		\
+    sv1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y);		\
+    sv2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z);		\
+    sv3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y);		\
+    sv4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z);		\
+    sv5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z);		\
+  }                                                                     \
+}
+
 #define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp)    	\
 {									\
  if (evflag) {								\
--- a/src/USER-INTEL/math_extra_intel.h
+++ b/src/USER-INTEL/math_extra_intel.h
@ -351,4 +351,128 @@
  ans##_0 = (aug_3 - t) / aug_0;			\
 }

+/* ----------------------------------------------------------------------
+   normalize a quaternion
+------------------------------------------------------------------------- */
+
+#define ME_qnormalize(q)						\
+{									\
+  double norm = 1.0 /							\
+    sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k);	\
+  q##_w *= norm;							\
+  q##_i *= norm;							\
+  q##_j *= norm;							\
+  q##_k *= norm;							\
+}
+
+/* ----------------------------------------------------------------------
+   compute omega from angular momentum
+   w = omega = angular velocity in space frame
+   wbody = angular velocity in body frame
+   project space-frame angular momentum onto body axes
+     and divide by principal moments
+------------------------------------------------------------------------- */
+
+#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w)	\
+{									\
+  double wbody_0, wbody_1, wbody_2;					\
+  double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8;	\
+									\
+  double w2 = quat##_w * quat##_w;					\
+  double i2 = quat##_i * quat##_i;					\
+  double j2 = quat##_j * quat##_j;					\
+  double k2 = quat##_k * quat##_k;					\
+  double twoij = 2.0 * quat##_i * quat##_j;				\
+  double twoik = 2.0 * quat##_i * quat##_k;				\
+  double twojk = 2.0 * quat##_j * quat##_k;				\
+  double twoiw = 2.0 * quat##_i * quat##_w;				\
+  double twojw = 2.0 * quat##_j * quat##_w;				\
+  double twokw = 2.0 * quat##_k * quat##_w;				\
+    									\
+  rot##_0 = w2 + i2 - j2 - k2;					        \
+  rot##_1 = twoij - twokw;						\
+  rot##_2 = twojw + twoik;						\
+	  								\
+  rot##_3 = twoij + twokw;					        \
+  rot##_4 = w2 - i2 + j2 - k2;				                \
+  rot##_5 = twojk - twoiw;					        \
+									\
+  rot##_6 = twoik - twojw;				                \
+  rot##_7 = twojk + twoiw;				                \
+  rot##_8 = w2 - i2 - j2 + k2;			                        \
+									\
+  wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2;              \
+  wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2;              \
+  wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2;              \
+									\
+  wbody_0 *= moments_0;							\
+  wbody_1 *= moments_1;							\
+  wbody_2 *= moments_2;							\
+									\
+  w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2;          \
+  w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2;          \
+  w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2;          \
+}
+
+#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2)	\
+{									\
+  angmomin[0] += dtf * torque[0];					\
+  double angmom_0 = angmomin[0];					\
+  angmomin[1] += dtf * torque[1];					\
+  double angmom_1 = angmomin[1];					\
+  angmomin[2] += dtf * torque[2];					\
+  double angmom_2 = angmomin[2];					\
+									\
+  double quat_w = quatin[0];						\
+  double quat_i = quatin[1];						\
+  double quat_j = quatin[2];						\
+  double quat_k = quatin[3];						\
+									\
+  double omega_0, omega_1, omega_2;					\
+  ME_mq_to_omega(angmom,quat,i0,i1,i2,omega);				\
+									\
+  double wq_0, wq_1, wq_2, wq_3;					\
+  wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k;		\
+  wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j;		\
+  wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k;		\
+  wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i;		\
+									\
+  double qfull_w, qfull_i, qfull_j, qfull_k;				\
+  qfull_w = quat_w + dtq * wq_0;					\
+  qfull_i = quat_i + dtq * wq_1;					\
+  qfull_j = quat_j + dtq * wq_2;					\
+  qfull_k = quat_k + dtq * wq_3;					\
+  ME_qnormalize(qfull);							\
+									\
+  double qhalf_w, qhalf_i, qhalf_j, qhalf_k;				\
+  qhalf_w = quat_w + 0.5*dtq * wq_0;					\
+  qhalf_i = quat_i + 0.5*dtq * wq_1;					\
+  qhalf_j = quat_j + 0.5*dtq * wq_2;					\
+  qhalf_k = quat_k + 0.5*dtq * wq_3;					\
+  ME_qnormalize(qhalf);							\
+  									\
+  ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega);				\
+  wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k;		\
+  wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j;		\
+  wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k;		\
+  wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i;		\
+									\
+  qhalf_w += 0.5*dtq * wq_0;						\
+  qhalf_i += 0.5*dtq * wq_1;						\
+  qhalf_j += 0.5*dtq * wq_2;						\
+  qhalf_k += 0.5*dtq * wq_3;						\
+  ME_qnormalize(qhalf);							\
+									\
+  quat_w = 2.0*qhalf_w - qfull_w;					\
+  quat_i = 2.0*qhalf_i - qfull_i;					\
+  quat_j = 2.0*qhalf_j - qfull_j;					\
+  quat_k = 2.0*qhalf_k - qfull_k;					\
+  ME_qnormalize(quat);							\
+									\
+  quatin[0] = quat_w;							\
+  quatin[1] = quat_i;							\
+  quatin[2] = quat_j;							\
+  quatin[3] = quat_k;							\
+}
+
 #endif
--- a/src/USER-INTEL/neigh_half_bin_intel.cpp
+++ b/src/USER-INTEL/neigh_half_bin_intel.cpp
@ -15,6 +15,8 @@
   Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */

+//#define OUTER_CHUNK 1
+
 #include "neighbor.h"
 #include "neigh_list.h"
 #include "atom.h"
@ -26,6 +28,14 @@
 #include <omp.h>
 #endif

+#ifdef LMP_USE_AVXCD
+#include "intel_simd.h"
+#endif
+
+#ifdef OUTER_CHUNK
+#include "intel_simd.h"
+#endif
+
 using namespace LAMMPS_NS;

 #ifdef _LMP_INTEL_OFFLOAD
@ -42,17 +52,11 @@ using namespace LAMMPS_NS;
  for (int s = 0; s < n3; s++) {                                      \
    if (sptr[s] == tag) {                                             \
      if (s < n1) {                                                   \
-        if (special_flag[1] == 0) which = -1;                         \
-        else if (special_flag[1] == 1) which = 0;                     \
-        else which = 1;                                               \
+        which = 1;						      \
      } else if (s < n2) {                                            \
-        if (special_flag[2] == 0) which = -1;                         \
-        else if (special_flag[2] == 1) which = 0;                     \
-        else which = 2;                                               \
+        which = 2;						      \
      } else {                                                        \
-        if (special_flag[3] == 0) which = -1;                         \
-        else if (special_flag[3] == 1) which = 0;                     \
-        else which = 3;                                               \
+        which = 3;						      \
      }                                                               \
    }                                                                 \
  }                                                                   \
@ -199,7 +203,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
  if (offload) {
    fix->start_watch(TIME_PACK);
    buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
-    buffers->grow_nbor(list, atom->nlocal, aend);
+    buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);

    ATOM_T biga;
    biga.x = INTEL_BIGP;
@ -335,7 +339,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
    signal(tag)
  #endif
  {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime();
    #endif

@ -494,12 +498,12 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
 	    if (j >= nlocal) {
 	      if (j == nall)
 		jlist[jj] = nall_offset;
-	      else if (which > 0)
+	      else if (which) 
 		jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
 	      else jlist[jj]-=ghost_offset;
            } else
            #endif
-	      if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+	      if (which) jlist[jj] = j ^ (which << SBBITS);
          }
        }
      }
@ -520,7 +524,7 @@ void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
      }
      #endif
    } // end omp
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime() - *timer_compute;
    #endif
  } // end offload
@ -688,7 +692,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
  if (offload) {
    fix->start_watch(TIME_PACK);
    buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
-    buffers->grow_nbor(list, atom->nlocal, aend);
+    buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);

    ATOM_T biga;
    biga.x = INTEL_BIGP;
@ -827,7 +831,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
    signal(tag)
  #endif
  {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime();
    #endif

@ -848,33 +852,47 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,

      const int num = aend - astart;
      int tid, ifrom, ito;
+
+      #ifdef OUTER_CHUNK
+      const int swidth = ip_simd::SIMD_type<flt_t>::width();
+      IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, swidth);
+      ifrom += astart;
+      ito += astart;
+      int e_ito = ito;
+      if (ito == num) {
+	int imod = ito % swidth;
+	if (imod) e_ito += swidth - e_ito;
+      }
+      const int list_size = (e_ito + tid + 1) * maxnbors;
+      #else
+      const int swidth = 1;
      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
      ifrom += astart;
      ito += astart;
+      const int list_size = (ito + tid + 1) * maxnbors;
+      #endif

      int which;

-      const int list_size = (ito + tid + 1) * maxnbors;
+      int pack_offset = maxnbors * swidth;
      int ct = (ifrom + tid) * maxnbors;
      int *neighptr = firstneigh + ct;
+
+      int max_chunk = 0;
+      int lane = 0;
      for (int i = ifrom; i < ito; i++) {
-        int j, k, n, n2, itype, jtype, ibin;
-        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
-
-        n = 0;
-	n2 = maxnbors;
-
-        xtmp = x[i].x;
-        ytmp = x[i].y;
-        ztmp = x[i].z;
-        itype = x[i].w;
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const int itype = x[i].w;
        const int ioffset = ntypes * itype;

        // loop over rest of atoms in i's bin, ghosts are at end of linked list
        // if j is owned atom, store it, since j is beyond i in linked list
        // if j is ghost, only store if j coords are "above/to the right" of i

-        for (j = bins[i]; j >= 0; j = bins[j]) {
+	int raw_count = pack_offset;
+        for (int j = bins[i]; j >= 0; j = bins[j]) {
          if (j >= nlocal) {
            if (offload_noghost && offload) continue;
            if (x[j].z < ztmp) continue;
@ -884,116 +902,145 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
            }
          } else if (offload_noghost && i < offload_end) continue;

-          jtype = x[j].w;
          #ifndef _LMP_INTEL_OFFLOAD
-          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+          if (exclude) {
+	    const int jtype = x[j].w;
+	    if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
+	  }
 	  #endif

-          delx = xtmp - x[j].x;
-          dely = ytmp - x[j].y;
-          delz = ztmp - x[j].z;
-          rsq = delx * delx + dely * dely + delz * delz;
+	  neighptr[raw_count++] = j;
+	}

-          if (rsq <= cutneighsq[ioffset + jtype]) {
-	    if (j < nlocal) {
-              if (need_ic) {
-                int no_special;
-                ominimum_image_check(no_special, delx, dely, delz);
-                if (no_special)
-                  neighptr[n++] = -j - 1;
-                else
-                  neighptr[n++] = j;
-              } else
-	        neighptr[n++] = j;
-	      #ifdef _LMP_INTEL_OFFLOAD
-	      if (j < lmin) lmin = j;
-	      if (j > lmax) lmax = j;
-              #endif
-	    } else {
-              if (need_ic) {
-                int no_special;
-                ominimum_image_check(no_special, delx, dely, delz);
-                if (no_special)
-                  neighptr[n2++] = -j - 1;
-                else
-                  neighptr[n2++] = j;
-              } else
-	        neighptr[n2++] = j;
-	      #ifdef _LMP_INTEL_OFFLOAD
-	      if (j < gmin) gmin = j;
-	      if (j > gmax) gmax = j;
-              #endif
-            }
-	  }
-        }
        // loop over all atoms in other bins in stencil, store every pair

-        ibin = atombin[i];
-
-        for (k = 0; k < nstencil; k++) {
-          for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
+        const int ibin = atombin[i];
+        for (int k = 0; k < nstencil; k++) {
+          for (int j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
            if (offload_noghost) {
              if (j < nlocal) {
                if (i < offload_end) continue;
              } else if (offload) continue;
            }

-            jtype = x[j].w;
            #ifndef _LMP_INTEL_OFFLOAD
-            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+            if (exclude) {
+	      const int jtype = x[j].w;
+	      if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
+	    }
 	    #endif

-            delx = xtmp - x[j].x;
-            dely = ytmp - x[j].y;
-            delz = ztmp - x[j].z;
-            rsq = delx * delx + dely * dely + delz * delz;
-            if (rsq <= cutneighsq[ioffset + jtype]) {
-	      if (j < nlocal) {
-                if (need_ic) {
-                  int no_special;
-                  ominimum_image_check(no_special, delx, dely, delz);
-                  if (no_special)
-                    neighptr[n++] = -j - 1;
-		  else
-                    neighptr[n++] = j;
-                } else
-  		  neighptr[n++] = j;
-                #ifdef _LMP_INTEL_OFFLOAD
-		if (j < lmin) lmin = j;
-		if (j > lmax) lmax = j;
-                #endif
-	      } else {
-		if (need_ic) {
-		  int no_special;
-		  ominimum_image_check(no_special, delx, dely, delz);
-                  if (no_special)
-                    neighptr[n2++] = -j - 1;
-		  else
-                    neighptr[n2++] = j;
-                } else
-  		  neighptr[n2++] = j;
-	        #ifdef _LMP_INTEL_OFFLOAD
-		if (j < gmin) gmin = j;
-		if (j > gmax) gmax = j;
-                #endif
-	      }
-	    }
-          }
-        }
-        ilist[i] = i;
-
-        cnumneigh[i] = ct;
-        if (n > maxnbors) *overflow = 1;
-	for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
-        while( (n % pad_width) != 0 ) neighptr[n++] = e_nall;
-        numneigh[i] = n;
-	while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
-        ct += n;
-        neighptr += n;
-	if (ct + n + maxnbors > list_size) {
-	  *overflow = 1;
-	  ct = (ifrom + tid) * maxnbors;
+	    neighptr[raw_count++] = j;
+	  }
 	}
+
+        #if defined(LMP_SIMD_COMPILER)
+	#ifdef _LMP_INTEL_OFFLOAD
+	int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
+	#pragma vector aligned
+        #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
+	#else
+	#pragma vector aligned
+        #pragma simd
+	#endif
+	#endif
+	for (int u = pack_offset; u < raw_count; u++) {
+          int j = neighptr[u];
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+	  const int jtype = x[j].w;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          if (rsq > cutneighsq[ioffset + jtype]) 
+	    neighptr[u] = e_nall;
+	  else {
+	    if (need_ic) {
+	      int no_special;
+	      ominimum_image_check(no_special, delx, dely, delz);
+	      if (no_special)
+		neighptr[u] = -j - 1;
+	    }
+            #ifdef _LMP_INTEL_OFFLOAD
+            if (j < nlocal) {
+              if (j < vlmin) vlmin = j;
+              if (j > vlmax) vlmax = j;
+            } else {
+              if (j < vgmin) vgmin = j;
+              if (j > vgmax) vgmax = j;
+            }
+            #endif
+	  }
+	}
+        #ifdef _LMP_INTEL_OFFLOAD
+	lmin = MIN(lmin,vlmin);
+	gmin = MIN(gmin,vgmin);
+	lmax = MAX(lmax,vlmax);
+	gmax = MAX(gmax,vgmax);
+        #endif
+
+        int n = lane, n2 = pack_offset;
+	for (int u = pack_offset; u < raw_count; u++) {
+	  const int j = neighptr[u];
+	  int pj = j;
+	  if (pj < e_nall) {
+	    if (need_ic)
+	      if (pj < 0) pj = -pj - 1;
+
+	    if (pj < nlocal) {
+	      neighptr[n] = j;
+	      n += swidth;
+	    } else
+	      neighptr[n2++] = j;
+	  }
+	}
+	int ns = (n - lane) / swidth;
+	if (ns > maxnbors || n2 > list_size) *overflow = 1;
+	for (int u = pack_offset; u < n2; u++) {
+	  neighptr[n] = neighptr[u];
+	  n += swidth;
+	}
+
+        ilist[i] = i;
+        cnumneigh[i] = ct + lane;
+	ns += n2 - pack_offset;
+	#ifndef OUTER_CHUNK
+        while( (ns % pad_width) != 0 ) neighptr[ns++] = e_nall;
+	#endif
+        numneigh[i] = ns;
+
+	#ifdef OUTER_CHUNK
+	if (ns > max_chunk) max_chunk = ns;
+	lane++;
+	pack_offset -= maxnbors;
+	if (lane == swidth) {
+	  ct += max_chunk * swidth;
+	  const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+	  const int edge = (ct % alignb);
+	  if (edge) ct += alignb - edge;
+	  neighptr = firstneigh + ct;
+	  max_chunk = 0;
+	  pack_offset = maxnbors * swidth;
+	  lane = 0;
+	  if (ct + pack_offset + maxnbors > list_size) {
+            if (i < ito - 1) {
+	      *overflow = 1;
+	      ct = (ifrom + tid) * maxnbors;
+            }
+	  }
+	}
+	#else
+	ct += ns;
+	const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+	const int edge = (ct % alignb);
+	if (edge) ct += alignb - edge;
+	neighptr = firstneigh + ct;
+	if (ct + pack_offset + maxnbors > list_size) {
+	  if (i < ito - 1) {
+	    *overflow = 1;
+	    ct = (ifrom + tid) * maxnbors;
+	  }
+	}
+	#endif
      }

      if (*overflow == 1)
@ -1032,7 +1079,16 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
        for (int i = ifrom; i < ito; ++i) {
          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
+	  #ifndef OUTER_CHUNK
+          #if defined(LMP_SIMD_COMPILER)
+	  #pragma vector aligned
+          #pragma simd
+	  #endif
          for (int jj = 0; jj < jnum; jj++) {
+	  #else
+	  const int trip = jnum * swidth;
+          for (int jj = 0; jj < trip; jj+= swidth) {
+	  #endif
            const int j = jlist[jj];
 	    if (need_ic && j < 0) {
 	      which = 0;
@ -1044,12 +1100,12 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
 	    if (j >= nlocal) {
 	      if (j == e_nall)
 		jlist[jj] = nall_offset;
-	      else if (which > 0)
+	      else if (which) 
 		jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
 	      else jlist[jj]-=ghost_offset;
            } else
 	    #endif
-            if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+            if (which) jlist[jj] = j ^ (which << SBBITS);
          }
        }
      }
@ -1070,7 +1126,7 @@ void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
      }
      #endif
    } // end omp
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime() - *timer_compute;
    #endif
  } // end offload
@ -1238,7 +1294,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
  if (offload) {
    fix->start_watch(TIME_PACK);
    buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
-    buffers->grow_nbor(list, atom->nlocal, aend);
+    buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend);

    ATOM_T biga;
    biga.x = INTEL_BIGP;
@ -1377,7 +1433,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
    signal(tag)
  #endif
  {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime();
    #endif

@ -1550,12 +1606,12 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
 	    if (j >= nlocal) {
 	      if (j == e_nall)
 		jlist[jj] = nall_offset;
-	      else if (which > 0)
+	      else if (which) 
 		jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
 	      else jlist[jj]-=ghost_offset;
            } else
            #endif
-	      if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+	      if (which) jlist[jj] = j ^ (which << SBBITS);
          }
        }
      }
@ -1576,7 +1632,7 @@ void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,
      }
      #endif
    } // end omp
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime() - *timer_compute;
    #endif
  } // end offload
@ -1741,10 +1797,12 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
  const int nall = atom->nlocal + atom->nghost;
  int pad = 1;

+  const int pack_width = fix->nbor_pack_width();
+
  if (offload) {
    fix->start_watch(TIME_PACK);
    buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
-    buffers->grow_nbor(list, atom->nlocal, aend);
+    buffers->grow_nbor(list, atom->nlocal, comm->nthreads, aend, pack_width);

    ATOM_T biga;
    biga.x = INTEL_BIGP;
@ -1871,7 +1929,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
    in(atombin:length(aend) alloc_if(0) free_if(0)) \
    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
    in(special_flag:length(0) alloc_if(0) free_if(0)) \
-    in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload) \
+    in(maxnbors,nthreads,maxspecial,nstencil,e_nall,offload,pack_width)	\
    in(offload_end,separate_buffers,astart, aend, nlocal, molecular, ntypes) \
    in(xperiodic, yperiodic, zperiodic, xprd_half, yprd_half, zprd_half) \
    out(overflow:length(5) alloc_if(0) free_if(0)) \
@ -1879,7 +1937,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
    signal(tag)
  #endif
  {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime();
    #endif

@ -1900,36 +1958,40 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,

      const int num = aend - astart;
      int tid, ifrom, ito;
-      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
+
+      IP_PRE_omp_range_id_vec(ifrom, ito, tid, num, nthreads, pack_width);
      ifrom += astart;
      ito += astart;
+      int e_ito = ito;
+      if (ito == num) {
+	int imod = ito % pack_width;
+	if (imod) e_ito += pack_width - e_ito;
+      }
+      const int list_size = (e_ito + tid + 1) * maxnbors;

      int which;

-      const int list_size = (ito + tid + 1) * maxnbors;
+      int pack_offset = maxnbors * pack_width;
      int ct = (ifrom + tid) * maxnbors;
      int *neighptr = firstneigh + ct;
+
+      int max_chunk = 0;
+      int lane = 0;
      for (int i = ifrom; i < ito; i++) {
-        int j, k, n, n2, itype, jtype, ibin;
-        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
-
-        n = 0;
-        n2 = maxnbors;
-
-        xtmp = x[i].x;
-        ytmp = x[i].y;
-        ztmp = x[i].z;
-        itype = x[i].w;
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const int itype = x[i].w;
        const tagint itag = tag[i];
        const int ioffset = ntypes * itype;

+        const int ibin = atombin[i];
+	int raw_count = pack_offset;
+
        // loop over all atoms in surrounding bins in stencil including self
        // skip i = j
-
-        ibin = atombin[i];
-
-        for (k = 0; k < nstencil; k++) {
-          for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
+        for (int k = 0; k < nstencil; k++) {
+          for (int j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
            if (i == j) continue;

            if (offload_noghost) {
@ -1938,76 +2000,121 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
              } else if (offload) continue;
            }

-            jtype = x[j].w;
            #ifndef _LMP_INTEL_OFFLOAD
-            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
-            #endif
-
-            delx = xtmp - x[j].x;
-            dely = ytmp - x[j].y;
-            delz = ztmp - x[j].z;
-            rsq = delx * delx + dely * dely + delz * delz;
-            if (rsq <= cutneighsq[ioffset + jtype]) {
-	      const int jtag = tag[j];
-              int flist = 0;
-	      if (itag > jtag) {
-		if ((itag+jtag) % 2 == 0) flist = 1;
-	      } else if (itag < jtag) {
-		if ((itag+jtag) % 2 == 1) flist = 1;
-	      } else {
-		if (x[j].z < ztmp) flist = 1;
-		else if (x[j].z == ztmp && x[j].y < ytmp) flist = 1;
-		else if (x[j].z == ztmp && x[j].y == ytmp && x[j].x < xtmp)
-		  flist = 1;
-	      }
-	      if (flist) {
-                if (need_ic) {
-                  int no_special;
-                  ominimum_image_check(no_special, delx, dely, delz);
-                  if (no_special)
-                    neighptr[n2++] = -j - 1;
-		  else
-                    neighptr[n2++] = j;
-                } else
- 		  neighptr[n2++] = j;
-              } else {
-                if (need_ic) {
-                  int no_special;
-                  ominimum_image_check(no_special, delx, dely, delz);
-                  if (no_special)
-                    neighptr[n++] = -j - 1;
-		  else
-                    neighptr[n++] = j;
-                } else
- 		  neighptr[n++] = j;
-              }
-
-              #ifdef _LMP_INTEL_OFFLOAD
-	      if (j < nlocal) {
-                if (j < lmin) lmin = j;
-                if (j > lmax) lmax = j;
-              } else {
-                if (j < gmin) gmin = j;
-                if (j > gmax) gmax = j;
-              }
-              #endif
+            if (exclude) {
+              const int jtype = x[j].w;
+              if (exclusion(i,j,itype,jtype,mask,molecule)) continue;
            }
+            #endif
+	    
+	    neighptr[raw_count++] = j;
          }
        }
-        ilist[i] = i;

-        cnumneigh[i] = ct;
-        if (n > maxnbors) *overflow = 1;
-	atombin[i] = n;
-        for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
-        numneigh[i] = n;
-        while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
-        ct += n;
-        neighptr += n;
-        if (ct + n + maxnbors > list_size) {
-          *overflow = 1;
-          ct = (ifrom + tid) * maxnbors;
+        #if defined(LMP_SIMD_COMPILER)
+	#ifdef _LMP_INTEL_OFFLOAD
+	int vlmin = lmin, vlmax = lmax, vgmin = gmin, vgmax = gmax;
+	#pragma vector aligned
+        #pragma simd reduction(max:vlmax,vgmax) reduction(min:vlmin, vgmin)
+	#else
+	#pragma vector aligned
+        #pragma simd
+	#endif
+	#endif
+	for (int u = pack_offset; u < raw_count; u++) {
+          int j = neighptr[u];
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const int jtype = x[j].w;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          if (rsq > cutneighsq[ioffset + jtype]) 
+	    neighptr[u] = e_nall;
+	  else {
+            if (need_ic) {
+              int no_special;
+              ominimum_image_check(no_special, delx, dely, delz);
+              if (no_special)
+                neighptr[u] = -j - 1;
+            }
+            #ifdef _LMP_INTEL_OFFLOAD
+            if (j < nlocal) {
+              if (j < vlmin) vlmin = j;
+              if (j > vlmax) vlmax = j;
+            } else {
+              if (j < vgmin) vgmin = j;
+              if (j > vgmax) vgmax = j;
+            }
+            #endif
+	  }
+	}
+        #ifdef _LMP_INTEL_OFFLOAD
+	lmin = MIN(lmin,vlmin);
+	gmin = MIN(gmin,vgmin);
+	lmax = MAX(lmax,vlmax);
+	gmax = MAX(gmax,vgmax);
+        #endif
+
+        int n = lane, n2 = pack_offset;
+	for (int u = pack_offset; u < raw_count; u++) {
+	  const int j = neighptr[u];
+	  int pj = j;
+	  if (pj < e_nall) {
+	    if (need_ic)
+	      if (pj < 0) pj = -pj - 1;
+	
+	    const int jtag = tag[pj];
+	    int flist = 0;
+	    if (itag > jtag) {
+	      if ((itag+jtag) % 2 == 0) flist = 1;
+	    } else if (itag < jtag) {
+	      if ((itag+jtag) % 2 == 1) flist = 1;
+	    } else {
+              if (x[pj].z < ztmp) flist = 1;
+	      else if (x[pj].z == ztmp && x[pj].y < ytmp) flist = 1;
+	      else if (x[pj].z == ztmp && x[pj].y == ytmp && x[pj].x < xtmp) 
+	      flist = 1;
+	    }
+	    if (flist) {
+	      neighptr[n2++] = j;
+	    } else {
+	      neighptr[n] = j;
+	      n += pack_width;
+	    }
+          }
        }
+	int ns = (n - lane) / pack_width;
+	if (ns > maxnbors || n2 > list_size) *overflow = 1;
+	atombin[i] = ns;
+	for (int u = pack_offset; u < n2; u++) {
+	  neighptr[n] = neighptr[u];
+	  n += pack_width;
+	}
+
+        ilist[i] = i;
+        cnumneigh[i] = ct + lane;
+	ns += n2 - pack_offset;
+        numneigh[i] = ns;
+
+	if (ns > max_chunk) max_chunk = ns;
+	lane++;
+	pack_offset -= maxnbors;
+	if (lane == pack_width) {
+	  ct += max_chunk * pack_width;
+	  const int alignb = (INTEL_DATA_ALIGN / sizeof(int));
+	  const int edge = (ct % alignb);
+	  if (edge) ct += alignb - edge;
+	  neighptr = firstneigh + ct;
+	  max_chunk = 0;
+	  pack_offset = maxnbors * pack_width;
+	  lane = 0;
+	  if (ct + pack_offset + maxnbors > list_size) {
+  	    if (i < ito - 1) {
+	      *overflow = 1;
+	      ct = (ifrom + tid) * maxnbors;
+	    }
+          }
+	}
      }

      if (*overflow == 1)
@ -2046,7 +2153,9 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
        for (int i = ifrom; i < ito; ++i) {
          int * _noalias jlist = firstneigh + cnumneigh[i];
          const int jnum = numneigh[i];
-          for (int jj = 0; jj < jnum; jj++) {
+
+	  const int trip = jnum * pack_width;
+          for (int jj = 0; jj < trip; jj+=pack_width) {
            const int j = jlist[jj];
 	    if (need_ic && j < 0) {
 	      which = 0;
@ -2058,12 +2167,12 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
            if (j >= nlocal) {
              if (j == e_nall)
                jlist[jj] = nall_offset;
-              else if (which > 0)
+              else if (which)
                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
              else jlist[jj]-=ghost_offset;
            } else
            #endif
-            if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+            if (which) jlist[jj] = j ^ (which << SBBITS);
          }
        }
      }
@ -2083,7 +2192,7 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
      }
      #endif
    } // end omp
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime() - *timer_compute;
    #endif
  } // end offload
@ -2113,3 +2222,4 @@ void Neighbor::fbi(const int offload, NeighList *list, void *buffers_in,
    #endif
  }
 }
+
--- a/src/USER-INTEL/pair_gayberne_intel.cpp
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@ -12,9 +12,17 @@
   Contributing author: W. Michael Brown (Intel)
 ------------------------------------------------------------------------- */

-#include <math.h>
 #include "pair_gayberne_intel.h"
 #include "math_extra_intel.h"
+
+#ifdef _LMP_INTEL_OFFLOAD
+#pragma offload_attribute(push,target(mic))
+#endif
+#include <cmath>
+#ifdef _LMP_INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
 #include "atom.h"
 #include "comm.h"
 #include "atom_vec_ellipsoid.h"
@ -295,7 +303,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
    signal(f_start)
  #endif
  {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute=MIC_Wtime();
    #endif

@ -335,8 +343,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,

    acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
    if (EVFLAG) {
-      oevdwl = (acc_t)0;
-      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+      oevdwl = (acc_t)0.0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
    }

    // loop over neighbors of my atoms
@ -394,8 +402,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
        fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;

        if (EVFLAG) {
-          if (EFLAG) fwtmp = sevdwl = (acc_t)0;
-          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+          if (EFLAG) fwtmp = sevdwl = (acc_t)0.0;
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
        }

        bool multiple_forms = false;
@ -485,14 +493,14 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
          tempv_1 = kappa_1 * inv_r;
          tempv_2 = kappa_2 * inv_r;
          flt_t sigma12 = ME_dot3(r12hat, tempv);
-          sigma12 = pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
+          sigma12 = std::pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
          flt_t h12 = r - sigma12;

          // energy
          // compute u_r

          flt_t varrho = sigma / (h12 + gamma * sigma);
-          flt_t varrho6 = pow(varrho, (flt_t)6.0);
+          flt_t varrho6 = std::pow(varrho, (flt_t)6.0);
          flt_t varrho12 = varrho6 * varrho6;
          flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);

@ -500,7 +508,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,

          flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
          flt_t det_g12 = ME_det3(g12);
-          eta = pow(eta / det_g12, upsilon);
+          eta = std::pow(eta / det_g12, upsilon);

          // compute chi_12

@ -516,7 +524,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
          tempv_1 = iota_1 * inv_r;
          tempv_2 = iota_2 * inv_r;
          flt_t chi = ME_dot3(r12hat, tempv);
-          chi = pow(chi * (flt_t)2.0, mu);
+          chi = std::pow(chi * (flt_t)2.0, mu);

          // force
          // compute dUr/dr
@ -524,7 +532,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
          temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) /
 	    sigma;
          temp1 = temp1 * (flt_t)24.0 * epsilon;
-          flt_t u_slj = temp1 * pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
+          flt_t u_slj = temp1 * std::pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
          flt_t dUr_0, dUr_1, dUr_2;
          temp2 = ME_dot3(kappa, r12hat);
          flt_t uslj_rsq = u_slj / rsq_form[jj];
@ -536,8 +544,8 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,

          flt_t dchi_0, dchi_1, dchi_2;
          temp1 = ME_dot3(iota, r12hat);
-          temp2 = (flt_t)-4.0 / rsq_form[jj] * mu *
-	    pow(chi, (mu - (flt_t)1.0) / mu);
+          temp2 = (flt_t)-4.0 / rsq_form[jj] * mu * 
+	    std::pow(chi, (mu - (flt_t)1.0) / mu);
          dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
          dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
          dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
@ -714,7 +722,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
 	    }

 	    if (EVFLAG) {
-	      flt_t ev_pre = (flt_t)0;
+	      flt_t ev_pre = (flt_t)0.0;
 	      if (NEWTON_PAIR || i < nlocal)
 		ev_pre += (flt_t)0.5;
 	      if (NEWTON_PAIR || j < nlocal)
@ -863,7 +871,7 @@ void PairGayBerneIntel::eval(const int offload, const int vflag,
      }
    }

-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime() - *timer_compute;
    #endif
  } // offload
--- a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@ -217,7 +217,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
    ITABLE_IN signal(f_start)
  #endif
  {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime();
    #endif

@ -459,7 +459,7 @@ void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
        ev_global[7] = ov5;
      }
    }
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime() - *timer_compute;
    #endif
  } // end of offload region
--- a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@ -212,7 +212,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
    ITABLE_IN signal(f_start)
  #endif
  {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime();
    #endif

@ -263,7 +263,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 	}

-        #if defined(__INTEL_COMPILER)
+        #if defined(LMP_SIMD_COMPILER)
 	#pragma vector aligned
 	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
 	                       sv0, sv1, sv2, sv3, sv4, sv5)
@ -283,7 +283,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,

          const flt_t r2inv = (flt_t)1.0 / rsq;

-	  #ifdef __MIC__
+	  #ifdef INTEL_VMASK
 	  if (rsq < c_forcei[jtype].cutsq) {
          #endif
            #ifdef INTEL_ALLOW_TABLE
@ -335,11 +335,11 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
              }
            }
            #endif
-	  #ifdef __MIC__
+	  #ifdef INTEL_VMASK
 	  }
 	  #endif

-	  #ifdef __MIC__
+	  #ifdef INTEL_VMASK
 	  if (rsq < c_forcei[jtype].cut_ljsq) {
 	  #endif
            flt_t r6inv = r2inv * r2inv * r2inv;
@ -354,7 +354,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
              forcelj *= factor_lj;
              if (EFLAG) evdwl *= factor_lj;
            }
-	  #ifdef __MIC__
+	  #ifdef INTEL_VMASK
 	  }
 	  #else
 	  if (rsq > c_forcei[jtype].cutsq)
@ -363,7 +363,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
 	    { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
 	  #endif

-	  #ifdef __MIC__
+	  #ifdef INTEL_VMASK
 	  if (rsq < c_forcei[jtype].cutsq) {
 	  #endif
            const flt_t fpair = (forcecoul + forcelj) * r2inv;
@ -395,7 +395,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
              }
 	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
            }
-          #ifdef __MIC__
+          #ifdef INTEL_VMASK
 	  }
 	  #endif
        } // for jj
@ -426,7 +426,7 @@ void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
        ev_global[7] = ov5;
      }
    }
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime() - *timer_compute;
    #endif
  } // end of offload region
--- a/src/USER-INTEL/pair_lj_cut_intel.cpp
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@ -88,39 +88,73 @@ void PairLJCutIntel::compute(int eflag, int vflag,
    fix->stop_watch(TIME_PACK);
  }

-  if (evflag || vflag_fdotr) {
-    int ovflag = 0;
-    if (vflag_fdotr) ovflag = 2;
-    else if (vflag) ovflag = 1;
-    if (eflag) {
-      if (force->newton_pair) {
-        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+  if (_onetype) {
+    if (evflag || vflag_fdotr) {
+      int ovflag = 0;
+      if (vflag_fdotr) ovflag = 2;
+      else if (vflag) ovflag = 1;
+      if (eflag) {
+	if (force->newton_pair) {
+	  eval<1,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<1,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+	} else {
+	  eval<1,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<1,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+	}
      } else {
-        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+	if (force->newton_pair) {
+	  eval<1,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<1,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+	} else {
+	  eval<1,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<1,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+	}
      }
    } else {
      if (force->newton_pair) {
-        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+	eval<1,0,0,1>(1, 0, buffers, fc, 0, offload_end);
+	eval<1,0,0,1>(0, 0, buffers, fc, host_start, inum);
      } else {
-        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
-        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+	eval<1,0,0,0>(1, 0, buffers, fc, 0, offload_end);
+	eval<1,0,0,0>(0, 0, buffers, fc, host_start, inum);
      }
    }
  } else {
-    if (force->newton_pair) {
-      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+    if (evflag || vflag_fdotr) {
+      int ovflag = 0;
+      if (vflag_fdotr) ovflag = 2;
+      else if (vflag) ovflag = 1;
+      if (eflag) {
+	if (force->newton_pair) {
+	  eval<0,1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<0,1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+	} else {
+	  eval<0,1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<0,1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+	}
+      } else {
+	if (force->newton_pair) {
+	  eval<0,1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<0,1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+	} else {
+	  eval<0,1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+	  eval<0,1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+	}
+      }
    } else {
-      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
-      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+      if (force->newton_pair) {
+	eval<0,0,0,1>(1, 0, buffers, fc, 0, offload_end);
+	eval<0,0,0,1>(0, 0, buffers, fc, host_start, inum);
+      } else {
+	eval<0,0,0,0>(1, 0, buffers, fc, 0, offload_end);
+	eval<0,0,0,0>(0, 0, buffers, fc, host_start, inum);
+      }
    }
  }
 }

-template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, 
+	  class acc_t>
 void PairLJCutIntel::eval(const int offload, const int vflag,
                          IntelBuffers<flt_t,acc_t> *buffers,
                          const ForceConst<flt_t> &fc,
@ -159,7 +193,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
  const int nthreads = tc;
  int *overflow = fix->get_off_overflow_flag();
  {
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime();
    #endif

@ -187,12 +221,25 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
      FORCE_T * _noalias const f = f_start - minlocal + (tid * f_stride);
      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));

+      flt_t cutsq, lj1, lj2, lj3, lj4, offset;
+      if (ONETYPE) {
+	cutsq = ljc12o[3].cutsq;
+	lj1 = ljc12o[3].lj1;
+	lj2 = ljc12o[3].lj2;
+	lj3 = lj34[3].lj3;
+	lj4 = lj34[3].lj4;
+	offset = ljc12o[3].offset;
+      }
      for (int i = iifrom; i < iito; ++i) {
-        const int itype = x[i].w;
-
-        const int ptr_off = itype * ntypes;
-        const FC_PACKED1_T * _noalias const ljc12oi = ljc12o + ptr_off;
-        const FC_PACKED2_T * _noalias const lj34i = lj34 + ptr_off;
+        int itype, ptr_off;
+        const FC_PACKED1_T * _noalias ljc12oi;
+        const FC_PACKED2_T * _noalias lj34i;
+	if (!ONETYPE) {
+	  itype = x[i].w;
+          ptr_off = itype * ntypes;
+          ljc12oi = ljc12o + ptr_off;
+          lj34i = lj34 + ptr_off;
+	}

        const int * _noalias const jlist = firstneigh + cnumneigh[i];
        const int jnum = numneigh[i];
@ -218,25 +265,42 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
          flt_t forcelj, evdwl;
          forcelj = evdwl = (flt_t)0.0;

-          const int sbindex = jlist[jj] >> SBBITS & 3;
-          const int j = jlist[jj] & NEIGHMASK;
+	  int j, jtype, sbindex;
+	  if (!ONETYPE) {
+	    sbindex = jlist[jj] >> SBBITS & 3;
+	    j = jlist[jj] & NEIGHMASK;
+	  } else
+	    j = jlist[jj];
+
          const flt_t delx = xtmp - x[j].x;
          const flt_t dely = ytmp - x[j].y;
          const flt_t delz = ztmp - x[j].z;
-          const int jtype = x[j].w;
+          if (!ONETYPE) {
+	    jtype = x[j].w;
+            cutsq = ljc12oi[jtype].cutsq;
+	  }
          const flt_t rsq = delx * delx + dely * dely + delz * delz;

          #ifdef INTEL_VMASK
-          if (rsq < ljc12oi[jtype].cutsq) {
+          if (rsq < cutsq) {
 	  #endif
-            flt_t factor_lj = special_lj[sbindex];
+            flt_t factor_lj;
+	    if (!ONETYPE) factor_lj = special_lj[sbindex];
            flt_t r2inv = 1.0 / rsq;
            flt_t r6inv = r2inv * r2inv * r2inv;
            #ifndef INTEL_VMASK
-	    if (rsq > ljc12oi[jtype].cutsq) r6inv = (flt_t)0.0;
+	    if (rsq > cutsq) r6inv = (flt_t)0.0;
 	    #endif
-            forcelj = r6inv * (ljc12oi[jtype].lj1 * r6inv - ljc12oi[jtype].lj2);
-            flt_t fpair = factor_lj * forcelj * r2inv;
+	    if (!ONETYPE) {
+	      lj1 = ljc12oi[jtype].lj1;
+	      lj2 = ljc12oi[jtype].lj2;
+	    }
+            forcelj = r6inv * (lj1 * r6inv - lj2);
+            flt_t fpair;
+	    if (!ONETYPE)
+	      fpair = factor_lj * forcelj * r2inv;
+	    else
+	      fpair = forcelj * r2inv;

            fxtmp += delx * fpair;
            fytmp += dely * fpair;
@ -255,9 +319,13 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
                ev_pre += (flt_t)0.5;

              if (EFLAG) {
-                evdwl = r6inv * (lj34i[jtype].lj3 * r6inv-lj34i[jtype].lj4) -
-                    ljc12oi[jtype].offset;
-                evdwl *= factor_lj;
+		if (!ONETYPE) {
+		  lj3 = lj34i[jtype].lj3;
+		  lj4 = lj34i[jtype].lj4;
+		  offset = ljc12oi[jtype].offset;
+		}
+                evdwl = r6inv * (lj3 * r6inv - lj4) - offset;
+                if (!ONETYPE) evdwl *= factor_lj;
                sevdwl += ev_pre*evdwl;
                if (eatom) {
                  if (NEWTON_PAIR || i < nlocal)
@ -302,7 +370,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag,
        ev_global[7] = ov5;
      }
    }
-    #ifdef __MIC__
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
    *timer_compute = MIC_Wtime() - *timer_compute;
    #endif
  } // end offload
@ -352,6 +420,9 @@ template <class flt_t, class acc_t>
 void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
                                      IntelBuffers<flt_t,acc_t> *buffers)
 {
+  _onetype = 0;
+  if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
+
  int tp1 = atom->ntypes + 1;
  fc.set_ntypes(tp1,memory,_cop);
  buffers->set_ntypes(tp1);
--- a/src/USER-INTEL/pair_lj_cut_intel.h
+++ b/src/USER-INTEL/pair_lj_cut_intel.h
@ -39,13 +39,14 @@ class PairLJCutIntel : public PairLJCut {

 private:
  FixIntel *fix;
-  int _cop;
+  int _cop, _onetype;

  template <class flt_t> class ForceConst;
  template <class flt_t, class acc_t>
  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
               const ForceConst<flt_t> &fc);
-  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  template <int ONETYPE, int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, 
+	    class acc_t>
  void eval(const int offload, const int vflag,
            IntelBuffers<flt_t,acc_t> * buffers,
            const ForceConst<flt_t> &fc, const int astart, const int aend);
--- a/src/USER-INTEL/pair_sw_intel.cpp
+++ b/src/USER-INTEL/pair_sw_intel.cpp
--- a/src/USER-INTEL/pair_sw_intel.h
+++ b/src/USER-INTEL/pair_sw_intel.h
@ -11,6 +11,10 @@
   See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */

+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
 #ifdef PAIR_CLASS

 PairStyle(sw/intel,PairSWIntel)
@ -42,7 +46,7 @@ class PairSWIntel : public PairSW {
  template <class flt_t, class acc_t>
  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
               const ForceConst<flt_t> &fc);
-  template <int SPQ, int EVFLAG, int EFLAG, class flt_t, class acc_t>
+  template <int SPQ,int ONETYPE,int EVFLAG,int EFLAG,class flt_t,class acc_t>
  void eval(const int offload, const int vflag,
            IntelBuffers<flt_t,acc_t> * buffers, const ForceConst<flt_t> &fc,
 	    const int astart, const int aend, const int pad_width);
@ -51,7 +55,10 @@ class PairSWIntel : public PairSW {
  void pack_force_const(ForceConst<flt_t> &fc,
                        IntelBuffers<flt_t, acc_t> *buffers);

-  int _ccache_stride, _host_pad, _offload_pad, _spq;
+  int _ccache_stride, _host_pad, _offload_pad, _spq, _onetype;
+  #ifdef LMP_USE_AVXCD
+  int _ccache_stride3;
+  #endif

  // ----------------------------------------------------------------------

@ -62,8 +69,11 @@ class PairSWIntel : public PairSW {
      flt_t cutsq, cut, sigma_gamma, pad;
    } fc_packed0;
    typedef struct {
-      flt_t powerp, powerq, cut, sigma, c1, c2, c3, c4;
+      flt_t powerp, powerq, cut, sigma;
    } fc_packed1;
+    typedef struct {
+      flt_t c1, c2, c3, c4;
+    } fc_packed1p2;
    typedef struct {
      flt_t c5, c6;
    } fc_packed2;
@ -73,6 +83,7 @@ class PairSWIntel : public PairSW {

    fc_packed0 **p2;
    fc_packed1 **p2f;
+    fc_packed1p2 **p2f2;
    fc_packed2 **p2e;
    fc_packed3 ***p3;