diff --git a/src/Depend.sh b/src/Depend.sh
index 3786172d64..632aa4a3e8 100644
--- a/src/Depend.sh
+++ b/src/Depend.sh
@@ -44,6 +44,7 @@ depend () {
 if (test $1 = "ASPHERE") then
   depend GPU
   depend USER-OMP
+  depend USER-INTEL
 fi
 
 if (test $1 = "CLASS2") then
@@ -72,6 +73,7 @@ if (test $1 = "KSPACE") then
   depend OPT
   depend USER-CUDA
   depend USER-OMP
+  depend USER-INTEL
   depend USER-PHONON
 fi
 
@@ -88,6 +90,7 @@ if (test $1 = "MOLECULE") then
   depend USER-CUDA
   depend USER-MISC
   depend USER-OMP
+  depend USER-INTEL
 fi
 
 if (test $1 = "PERI") then
diff --git a/src/GRANULAR/pair_gran_hooke_history.cpp b/src/GRANULAR/pair_gran_hooke_history.cpp
index 77d2f0d70a..7f98cfa4e7 100644
--- a/src/GRANULAR/pair_gran_hooke_history.cpp
+++ b/src/GRANULAR/pair_gran_hooke_history.cpp
@@ -45,7 +45,6 @@ PairGranHookeHistory::PairGranHookeHistory(LAMMPS *lmp) : Pair(lmp)
   no_virial_fdotr_compute = 1;
   history = 1;
   fix_history = NULL;
-  suffix = NULL;
 
   single_extra = 4;
   svector = new double[4];
@@ -67,7 +66,6 @@ PairGranHookeHistory::~PairGranHookeHistory()
 {
   delete [] svector;
   if (fix_history) modify->delete_fix("SHEAR_HISTORY");
-  if (suffix) delete[] suffix;
 
   if (allocated) {
     memory->destroy(setflag);
@@ -436,7 +434,7 @@ void PairGranHookeHistory::init_style()
     fixarg[0] = (char *) "SHEAR_HISTORY";
     fixarg[1] = (char *) "all";
     fixarg[2] = (char *) "SHEAR_HISTORY";
-    modify->add_fix(3,fixarg,suffix);
+    modify->add_fix(3,fixarg,1);
     delete [] fixarg;
     fix_history = (FixShearHistory *) modify->fix[modify->nfix-1];
     fix_history->pair = this;
diff --git a/src/GRANULAR/pair_gran_hooke_history.h b/src/GRANULAR/pair_gran_hooke_history.h
index 4e2e51a4cc..25762ca65d 100644
--- a/src/GRANULAR/pair_gran_hooke_history.h
+++ b/src/GRANULAR/pair_gran_hooke_history.h
@@ -54,7 +54,6 @@ class PairGranHookeHistory : public Pair {
   int freeze_group_bit;
   int history;
 
-  char *suffix;
   int neighprev;
   double *onerad_dynamic,*onerad_frozen;
   double *maxrad_dynamic,*maxrad_frozen;
diff --git a/src/KSPACE/fix_tune_kspace.cpp b/src/KSPACE/fix_tune_kspace.cpp
index 9abfc9d1b4..e3d5a5b5cd 100644
--- a/src/KSPACE/fix_tune_kspace.cpp
+++ b/src/KSPACE/fix_tune_kspace.cpp
@@ -218,7 +218,8 @@ void FixTuneKspace::store_old_kspace_settings()
    update the pair style if necessary, preserving the settings
 ------------------------------------------------------------------------- */
 
-void FixTuneKspace::update_pair_style(char *new_pair_style, double pair_cut_coul)
+void FixTuneKspace::update_pair_style(char *new_pair_style, 
+                                      double pair_cut_coul)
 {
   int itmp;
   double *p_cutoff = (double *) force->pair->extract("cut_coul",itmp);
@@ -235,7 +236,7 @@ void FixTuneKspace::update_pair_style(char *new_pair_style, double pair_cut_coul
 
   cout << "Creating new pair style: " << new_pair_style << endl;
   // delete old pair style and create new one
-  force->create_pair(new_pair_style,lmp->suffix);
+  force->create_pair(new_pair_style,1);
 
   // restore current pair settings from temporary file
   force->pair->read_restart(p_pair_settings_file);
@@ -252,7 +253,8 @@ void FixTuneKspace::update_pair_style(char *new_pair_style, double pair_cut_coul
    update the kspace style if necessary
 ------------------------------------------------------------------------- */
 
-void FixTuneKspace::update_kspace_style(char *new_kspace_style, char *new_acc_str)
+void FixTuneKspace::update_kspace_style(char *new_kspace_style, 
+                                        char *new_acc_str)
 {
   // create kspace style char string
 
@@ -269,8 +271,7 @@ void FixTuneKspace::update_kspace_style(char *new_kspace_style, char *new_acc_st
 
   // delete old kspace style and create new one
 
-  force->create_kspace(narg,arg,lmp->suffix);
-
+  force->create_kspace(narg,arg,1);
   force->kspace->differentiation_flag = old_differentiation_flag;
   force->kspace->slabflag = old_slabflag;
   force->kspace->slab_volfactor = old_slab_volfactor;
diff --git a/src/MAKE/Makefile.beacon b/src/MAKE/Makefile.beacon
new file mode 100755
index 0000000000..98e816a430
--- /dev/null
+++ b/src/MAKE/Makefile.beacon
@@ -0,0 +1,109 @@
+# beacon = Intel compiler, Intel MPI, MKL FFT, offload to Xeon Phi
+
+SHELL = /bin/sh
+
+# ---------------------------------------------------------------------
+# compiler/linker settings
+# specify flags and libraries needed for your compiler
+
+CC =		mpiicpc -openmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64
+MIC_OPT =       -offload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
+CCFLAGS =	-O3 -xAVX -fno-alias -ansi-alias -restrict -override-limits $(MIC_OPT)
+SHFLAGS =	-fPIC
+DEPFLAGS =	-M
+
+LINK =		mpiicpc -openmp
+LINKFLAGS =	-O3 -xAVX
+LIB =           
+SIZE =		size
+
+ARCHIVE =	ar
+ARFLAGS =	-rc
+SHLIBFLAGS =	-shared
+
+# ---------------------------------------------------------------------
+# LAMMPS-specific settings
+# specify settings for LAMMPS features you will use
+# if you change any -D setting, do full re-compile after "make clean"
+
+# LAMMPS ifdef settings, OPTIONAL
+# see possible settings in doc/Section_start.html#2_2 (step 4)
+
+LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
+
+# MPI library, REQUIRED
+# see discussion in doc/Section_start.html#2_2 (step 5)
+# can point to dummy MPI library in src/STUBS as in Makefile.serial
+# INC = path for mpi.h, MPI compiler settings
+# PATH = path for MPI library
+# LIB = name of MPI library
+
+MPI_INC =       -DMPICH_SKIP_MPICXX 
+MPI_PATH = 
+MPI_LIB =  
+
+# FFT library, OPTIONAL
+# see discussion in doc/Section_start.html#2_2 (step 6)
+# can be left blank to use provided KISS FFT library
+# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
+# PATH = path for FFT library
+# LIB = name of FFT library
+
+FFT_INC =      -DFFT_MKL -DFFT_SINGLE -I$(MKLROOT)
+FFT_PATH = 
+FFT_LIB =	-L$(MKLROOT) -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core
+
+# JPEG and/or PNG library, OPTIONAL
+# see discussion in doc/Section_start.html#2_2 (step 7)
+# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
+# INC = path(s) for jpeglib.h and/or png.h
+# PATH = path(s) for JPEG library and/or PNG library
+# LIB = name(s) of JPEG library and/or PNG library
+
+JPG_INC =       
+JPG_PATH = 	
+JPG_LIB =	-ljpeg
+
+# ---------------------------------------------------------------------
+# build rules and dependencies
+# no need to edit this section
+
+include	Makefile.package.settings
+include	Makefile.package
+
+EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
+EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
+EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
+
+# Path to src files
+
+vpath %.cpp ..
+vpath %.h ..
+
+# Link target
+
+$(EXE):	$(OBJ)
+	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
+	$(SIZE) $(EXE)
+
+# Library targets
+
+lib:	$(OBJ)
+	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+
+shlib:	$(OBJ)
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+        $(OBJ) $(EXTRA_LIB) $(LIB)
+
+# Compilation rules
+
+%.o:%.cpp
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
+
+%.d:%.cpp
+	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
+
+# Individual dependencies
+
+DEPENDS = $(OBJ:.o=.d)
+sinclude $(DEPENDS)
diff --git a/src/MAKE/Makefile.g++_openmpi b/src/MAKE/Makefile.g++_openmpi
new file mode 100755
index 0000000000..c8912f1713
--- /dev/null
+++ b/src/MAKE/Makefile.g++_openmpi
@@ -0,0 +1,108 @@
+# g++_openmpi = RedHat Linux box, g++4, OpenMPI, FFTW
+
+SHELL = /bin/sh
+
+# ---------------------------------------------------------------------
+# compiler/linker settings
+# specify flags and libraries needed for your compiler
+
+CC =		g++
+CCFLAGS =	-g -O # -Wunused
+SHFLAGS =	-fPIC
+DEPFLAGS =	-M
+
+LINK =		g++
+LINKFLAGS =	-g -O
+LIB = 
+SIZE =		size
+
+ARCHIVE =	ar
+ARFLAGS =	-rc
+SHLIBFLAGS =	-shared
+
+# ---------------------------------------------------------------------
+# LAMMPS-specific settings
+# specify settings for LAMMPS features you will use
+# if you change any -D setting, do full re-compile after "make clean"
+
+# LAMMPS ifdef settings, OPTIONAL
+# see possible settings in doc/Section_start.html#2_2 (step 4)
+
+LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
+
+# MPI library, REQUIRED
+# see discussion in doc/Section_start.html#2_2 (step 5)
+# can point to dummy MPI library in src/STUBS as in Makefile.serial
+# INC = path for mpi.h, MPI compiler settings
+# PATH = path for MPI library
+# LIB = name of MPI library
+
+MPI_INC =       -DMPICH_SKIP_MPICXX -I/usr/local/openmpi/include
+MPI_PATH =      -L/usr/local/openmpi/lib
+MPI_LIB =	-lmpi -lmpi_cxx
+
+# FFT library, OPTIONAL
+# see discussion in doc/Section_start.html#2_2 (step 6)
+# can be left blank to use provided KISS FFT library
+# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
+# PATH = path for FFT library
+# LIB = name of FFT library
+
+FFT_INC =    	-DFFT_FFTW
+FFT_PATH = 
+FFT_LIB =	-lfftw
+
+# JPEG and/or PNG library, OPTIONAL
+# see discussion in doc/Section_start.html#2_2 (step 7)
+# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
+# INC = path(s) for jpeglib.h and/or png.h
+# PATH = path(s) for JPEG library and/or PNG library
+# LIB = name(s) of JPEG library and/or PNG library
+
+JPG_INC =       
+JPG_PATH = 	
+JPG_LIB =	-ljpeg
+
+# ---------------------------------------------------------------------
+# build rules and dependencies
+# no need to edit this section
+
+include	Makefile.package.settings
+include	Makefile.package
+
+EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
+EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
+EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
+
+# Path to src files
+
+vpath %.cpp ..
+vpath %.h ..
+
+# Link target
+
+$(EXE):	$(OBJ)
+	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
+	$(SIZE) $(EXE)
+
+# Library targets
+
+lib:	$(OBJ)
+	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+
+shlib:	$(OBJ)
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+        $(OBJ) $(EXTRA_LIB) $(LIB)
+
+# Compilation rules
+
+%.o:%.cpp
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
+
+%.d:%.cpp
+	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
+
+# Individual dependencies
+
+DEPENDS = $(OBJ:.o=.d)
+sinclude $(DEPENDS)
diff --git a/src/MAKE/Makefile.intel b/src/MAKE/Makefile.intel
new file mode 100755
index 0000000000..2b209e27b0
--- /dev/null
+++ b/src/MAKE/Makefile.intel
@@ -0,0 +1,108 @@
+# Intel compiler, Intel MPI, MKL FFT, no offload to coprocessor
+
+SHELL = /bin/sh
+
+# ---------------------------------------------------------------------
+# compiler/linker settings
+# specify flags and libraries needed for your compiler
+
+CC =		mpiicpc -openmp -DLAMMPS_MEMALIGN=64 -no-offload
+CCFLAGS =	-O3 -xHost -fno-alias -ansi-alias -restrict -override-limits
+SHFLAGS =	-fPIC
+DEPFLAGS =	-M
+
+LINK =		mpiicpc -openmp
+LINKFLAGS =	-O3 -xHost
+LIB =           
+SIZE =		size
+
+ARCHIVE =	ar
+ARFLAGS =	-rc
+SHLIBFLAGS =	-shared
+
+# ---------------------------------------------------------------------
+# LAMMPS-specific settings
+# specify settings for LAMMPS features you will use
+# if you change any -D setting, do full re-compile after "make clean"
+
+# LAMMPS ifdef settings, OPTIONAL
+# see possible settings in doc/Section_start.html#2_2 (step 4)
+
+LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
+
+# MPI library, REQUIRED
+# see discussion in doc/Section_start.html#2_2 (step 5)
+# can point to dummy MPI library in src/STUBS as in Makefile.serial
+# INC = path for mpi.h, MPI compiler settings
+# PATH = path for MPI library
+# LIB = name of MPI library
+
+MPI_INC =       -DMPICH_SKIP_MPICXX 
+MPI_PATH = 
+MPI_LIB =
+
+# FFT library, OPTIONAL
+# see discussion in doc/Section_start.html#2_2 (step 6)
+# can be left blank to use provided KISS FFT library
+# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
+# PATH = path for FFT library
+# LIB = name of FFT library
+
+FFT_INC =      -DFFT_MKL -DFFT_SINGLE
+FFT_PATH = 
+FFT_LIB =	-L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core
+
+# JPEG and/or PNG library, OPTIONAL
+# see discussion in doc/Section_start.html#2_2 (step 7)
+# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
+# INC = path(s) for jpeglib.h and/or png.h
+# PATH = path(s) for JPEG library and/or PNG library
+# LIB = name(s) of JPEG library and/or PNG library
+
+JPG_INC =       
+JPG_PATH = 	
+JPG_LIB =	-ljpeg
+
+# ---------------------------------------------------------------------
+# build rules and dependencies
+# no need to edit this section
+
+include	Makefile.package.settings
+include	Makefile.package
+
+EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
+EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
+EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
+
+# Path to src files
+
+vpath %.cpp ..
+vpath %.h ..
+
+# Link target
+
+$(EXE):	$(OBJ)
+	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
+	$(SIZE) $(EXE)
+
+# Library targets
+
+lib:	$(OBJ)
+	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+
+shlib:	$(OBJ)
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+        $(OBJ) $(EXTRA_LIB) $(LIB)
+
+# Compilation rules
+
+%.o:%.cpp
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
+
+%.d:%.cpp
+	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
+
+# Individual dependencies
+
+DEPENDS = $(OBJ:.o=.d)
+sinclude $(DEPENDS)
diff --git a/src/MAKE/Makefile.intel_offload b/src/MAKE/Makefile.intel_offload
new file mode 100755
index 0000000000..eb4415fc8a
--- /dev/null
+++ b/src/MAKE/Makefile.intel_offload
@@ -0,0 +1,109 @@
+# Intel compiler, Intel MPI, MKL FFT, offload to Xeon Phi coprocessor
+
+SHELL = /bin/sh
+
+# ---------------------------------------------------------------------
+# compiler/linker settings
+# specify flags and libraries needed for your compiler
+
+CC =		mpiicpc -openmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64
+MIC_OPT =       -offload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
+CCFLAGS =	-g -O3 -xHost -fno-alias -ansi-alias -restrict -override-limits $(MIC_OPT)
+SHFLAGS =	-fPIC
+DEPFLAGS =	-M
+
+LINK =		mpiicpc -openmp -offload
+LINKFLAGS =	-O3 -xHost
+LIB =           
+SIZE =		size
+
+ARCHIVE =	ar
+ARFLAGS =	-rc
+SHLIBFLAGS =	-shared
+
+# ---------------------------------------------------------------------
+# LAMMPS-specific settings
+# specify settings for LAMMPS features you will use
+# if you change any -D setting, do full re-compile after "make clean"
+
+# LAMMPS ifdef settings, OPTIONAL
+# see possible settings in doc/Section_start.html#2_2 (step 4)
+
+LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
+
+# MPI library, REQUIRED
+# see discussion in doc/Section_start.html#2_2 (step 5)
+# can point to dummy MPI library in src/STUBS as in Makefile.serial
+# INC = path for mpi.h, MPI compiler settings
+# PATH = path for MPI library
+# LIB = name of MPI library
+
+MPI_INC =       -DMPICH_SKIP_MPICXX 
+MPI_PATH = 
+MPI_LIB =
+
+# FFT library, OPTIONAL
+# see discussion in doc/Section_start.html#2_2 (step 6)
+# can be left blank to use provided KISS FFT library
+# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
+# PATH = path for FFT library
+# LIB = name of FFT library
+
+FFT_INC =      -DFFT_MKL -DFFT_SINGLE
+FFT_PATH = 
+FFT_LIB =	-L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core
+
+# JPEG and/or PNG library, OPTIONAL
+# see discussion in doc/Section_start.html#2_2 (step 7)
+# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
+# INC = path(s) for jpeglib.h and/or png.h
+# PATH = path(s) for JPEG library and/or PNG library
+# LIB = name(s) of JPEG library and/or PNG library
+
+JPG_INC =       
+JPG_PATH = 	
+JPG_LIB =	-ljpeg
+
+# ---------------------------------------------------------------------
+# build rules and dependencies
+# no need to edit this section
+
+include	Makefile.package.settings
+include	Makefile.package
+
+EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
+EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
+EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
+
+# Path to src files
+
+vpath %.cpp ..
+vpath %.h ..
+
+# Link target
+
+$(EXE):	$(OBJ)
+	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
+	$(SIZE) $(EXE)
+
+# Library targets
+
+lib:	$(OBJ)
+	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+
+shlib:	$(OBJ)
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+        $(OBJ) $(EXTRA_LIB) $(LIB)
+
+# Compilation rules
+
+%.o:%.cpp
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
+
+%.d:%.cpp
+	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
+
+# Individual dependencies
+
+DEPENDS = $(OBJ:.o=.d)
+sinclude $(DEPENDS)
diff --git a/src/MAKE/Makefile.linux b/src/MAKE/Makefile.linux
index c4264dc223..d835bed045 100755
--- a/src/MAKE/Makefile.linux
+++ b/src/MAKE/Makefile.linux
@@ -7,12 +7,12 @@ SHELL = /bin/sh
 # specify flags and libraries needed for your compiler
 
 CC =		icc
-CCFLAGS =	-O
+CCFLAGS =	-O -DLAMMPS_MEMALIGN=64 -openmp -restrict
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		icc
-LINKFLAGS =	-O
+LINKFLAGS =	-O -openmp
 LIB =           -lstdc++
 SIZE =		size
 
diff --git a/src/MAKE/Makefile.stampede b/src/MAKE/Makefile.stampede
new file mode 100755
index 0000000000..8c9591d112
--- /dev/null
+++ b/src/MAKE/Makefile.stampede
@@ -0,0 +1,109 @@
+# Stampede, Intel Compiler, MKL FFT, Offload to Xeon Phi
+
+SHELL = /bin/sh
+
+# ---------------------------------------------------------------------
+# compiler/linker settings
+# specify flags and libraries needed for your compiler
+
+CC =		mpicc -openmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64
+MIC_OPT =       -offload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
+CCFLAGS =	-O3 -xAVX -fno-alias -ansi-alias -restrict -override-limits $(MIC_OPT)
+SHFLAGS =	-fPIC
+DEPFLAGS =	-M
+
+LINK =		mpicc -openmp
+LINKFLAGS =	-O3 -xAVX
+LIB =           
+SIZE =		size
+
+ARCHIVE =	ar
+ARFLAGS =	-rc
+SHLIBFLAGS =	-shared
+
+# ---------------------------------------------------------------------
+# LAMMPS-specific settings
+# specify settings for LAMMPS features you will use
+# if you change any -D setting, do full re-compile after "make clean"
+
+# LAMMPS ifdef settings, OPTIONAL
+# see possible settings in doc/Section_start.html#2_2 (step 4)
+
+LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
+
+# MPI library, REQUIRED
+# see discussion in doc/Section_start.html#2_2 (step 5)
+# can point to dummy MPI library in src/STUBS as in Makefile.serial
+# INC = path for mpi.h, MPI compiler settings
+# PATH = path for MPI library
+# LIB = name of MPI library
+
+MPI_INC =       -DMPICH_SKIP_MPICXX 
+MPI_PATH = 
+MPI_LIB =
+
+# FFT library, OPTIONAL
+# see discussion in doc/Section_start.html#2_2 (step 6)
+# can be left blank to use provided KISS FFT library
+# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
+# PATH = path for FFT library
+# LIB = name of FFT library
+
+FFT_INC =      -DFFT_MKL -DFFT_SINGLE -I$(TACC_MKL_INC)
+FFT_PATH = 
+FFT_LIB =	-L$(TACC_MKL_LIB) -lmkl_intel_ilp64 -lmkl_intel_thread -lmkl_core
+
+# JPEG and/or PNG library, OPTIONAL
+# see discussion in doc/Section_start.html#2_2 (step 7)
+# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
+# INC = path(s) for jpeglib.h and/or png.h
+# PATH = path(s) for JPEG library and/or PNG library
+# LIB = name(s) of JPEG library and/or PNG library
+
+JPG_INC =       
+JPG_PATH = 	
+JPG_LIB =	-ljpeg
+
+# ---------------------------------------------------------------------
+# build rules and dependencies
+# no need to edit this section
+
+include	Makefile.package.settings
+include	Makefile.package
+
+EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
+EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
+EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
+
+# Path to src files
+
+vpath %.cpp ..
+vpath %.h ..
+
+# Link target
+
+$(EXE):	$(OBJ)
+	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
+	$(SIZE) $(EXE)
+
+# Library targets
+
+lib:	$(OBJ)
+	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+
+shlib:	$(OBJ)
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+        $(OBJ) $(EXTRA_LIB) $(LIB)
+
+# Compilation rules
+
+%.o:%.cpp
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
+
+%.d:%.cpp
+	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
+
+# Individual dependencies
+
+DEPENDS = $(OBJ:.o=.d)
+sinclude $(DEPENDS)
diff --git a/src/Makefile b/src/Makefile
index f8e70a94dc..2c4bb15fa2 100755
--- a/src/Makefile
+++ b/src/Makefile
@@ -18,8 +18,8 @@ PACKAGE = asphere body class2 colloid dipole fld gpu granular kim \
 	  reax replica rigid shock srd voronoi xtc
 
 PACKUSER = user-atc user-awpmd user-cg-cmm user-colvars \
-	   user-cuda user-eff user-fep user-lb user-misc user-molfile \
-	   user-omp user-phonon user-qmmm user-reaxc user-sph
+	   user-cuda user-eff user-fep user-intel user-lb user-misc \
+	   user-molfile user-omp user-phonon user-qmmm user-reaxc user-sph
 
 PACKLIB = gpu kim meam poems reax voronoi \
 	  user-atc user-awpmd user-colvars user-qmmm user-cuda user-molfile
diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh
new file mode 100644
index 0000000000..70fc48306a
--- /dev/null
+++ b/src/USER-INTEL/Install.sh
@@ -0,0 +1,107 @@
+# Install/unInstall package files in LAMMPS
+# mode = 0/1/2 for uninstall/install/update
+
+mode=$1
+
+# arg1 = package file, arg2 = (optional) file in the parent directory that it depends on
+
+action () {  # install, update, or remove one package file ($1) in the parent directory
+  if (test $mode = 0) then  # mode 0 (uninstall): delete the installed copy
+    rm -f ../$1
+  elif (! cmp -s $1 ../$1) then  # installed copy is missing or differs from the package file
+    if (test -z "$2" || test -e ../$2) then  # copy only if no dependency was given or it exists
+      cp $1 ..
+      if (test $mode = 2) then  # only update mode reports the copy
+        echo "  updating src/$1"
+      fi
+    fi
+  elif (test -n "$2") then  # copies are identical and a dependency was given
+    if (test ! -e ../$2) then  # dependency no longer present: remove the installed copy
+      rm -f ../$1
+    fi
+  fi
+}
+
+# step 1: process all *_intel.cpp and *_intel.h files.
+# do not install child files if parent does not exist
+
+for file in *_intel.cpp; do  # hand every intel source file to action with its parent file
+  test $file = thr_intel.cpp && continue  # thr_intel.cpp is not processed by this loop
+  dep=`echo $file | sed 's/neigh_full_intel/neigh_full/g' | \
+      sed 's/_offload_intel//g' | sed 's/_intel//g'`  # parent = name without the intel suffixes
+  action $file $dep
+done
+
+for file in *_intel.h; do  # same treatment for the intel header files
+  test $file = thr_intel.h && continue  # thr_intel.h is not processed by this loop
+  dep=`echo $file | sed 's/_offload_intel//g' | sed 's/_intel//g'`  # parent = name without the intel suffixes
+  action $file $dep
+done
+
+action intel_preprocess.h
+action intel_buffers.h
+action intel_buffers.cpp
+action math_extra_intel.h
+
+# step 2: handle cases and tasks not handled in step 1.
+
+if (test $mode = 1) then  # install: refresh the package define in ../Makefile.package
+
+  if (test -e ../Makefile.package) then
+    sed -i -e 's/[^ \t]*INTEL[^ \t]* //' ../Makefile.package  # drop an existing word containing INTEL
+    sed -i -e 's|^PKG_INC =[ \t]*|&-DLMP_USER_INTEL |' ../Makefile.package  # put the define first in PKG_INC
+  fi
+
+  # force rebuild of files with LMP_USER_INTEL switch
+
+  touch ../accelerator_intel.h
+
+elif (test $mode = 0) then  # uninstall: only remove the package define
+
+  if (test -e ../Makefile.package) then
+    sed -i -e 's/[^ \t]*INTEL[^ \t]* //' ../Makefile.package  # drop an existing word containing INTEL
+  fi
+
+  # force rebuild of files with LMP_USER_INTEL switch
+
+  touch ../accelerator_intel.h
+
+fi
+
+# step 3: map omp styles that are not in the intel package to intel suffix
+
+#if (test $mode = 0) then
+#
+#  rm -f ../*ompinto_intel*
+#
+#else
+#
+#  echo "  The 'intel' suffix will use the USER-OMP package for all"
+#  echo "  angle, bond, dihedral, kspace, and improper styles:"
+#  stylelist="pair fix angle bond dihedral improper"
+#  for header in $stylelist; do
+#    HEADER=`echo $header | sed 's/\(.*\)/\U\1/'`
+#    outfile=../$header"_ompinto_intel.h"
+#    echo "    Creating $header style map: $outfile"
+#    echo -n "// -- Header to map USER-OMP " > $outfile  
+#    echo "styles to the intel suffix" >> $outfile
+#    echo >> $outfile
+#    echo "#ifdef "$HEADER"_CLASS" >> $outfile
+#    grep -h 'Style(' ../$header*_omp.h | grep -v 'charmm/coul/long' | \
+#	grep -v 'lj/cut' | grep -v 'gayberne' | \
+#	sed 's/\/omp/\/intel/g' >> $outfile
+#    echo "#endif" >> $outfile
+#  done
+#
+#  header="kspace"
+#  HEADER="KSPACE"
+#  outfile=../$header"_ompinto_intel.h"
+#  echo "    Creating $header style map: $outfile"
+#  echo -n "// -- Header to map USER-OMP " > $outfile  
+#  echo "styles to the intel suffix" >> $outfile
+#  echo >> $outfile
+#  echo "#ifdef "$HEADER"_CLASS" >> $outfile
+#  grep -h 'KSpaceStyle(' ../*_omp.h | sed 's/\/omp/\/intel/g' >> $outfile
+#  echo "#endif" >> $outfile
+#
+#fi
diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README
new file mode 100644
index 0000000000..0b38928b2e
--- /dev/null
+++ b/src/USER-INTEL/README
@@ -0,0 +1,35 @@
+
+                     --------------------------------
+                          LAMMPS Intel Package
+                     --------------------------------
+                     
+                        W. Michael Brown (Intel)
+                       michael.w.brown at intel.com
+
+-----------------------------------------------------------------------------
+
+This package is based on the USER-OMP package and provides LAMMPS styles that:
+
+   1. include support for single and mixed precision in addition to double
+   2. include modifications to support vectorization for key routines
+   3. include modifications to support offload to Xeon Phi coprocessors
+
+-----------------------------------------------------------------------------
+
+When using the suffix command with "intel", intel styles will be used if they
+exist; if they do not, and an omp version exists, that style will be used.
+This is accomplished through the files *ompinto_intel.h that are created
+in the src directory when the intel package is installed. For example,
+
+      kspace_style     pppm/intel 1e-4
+
+is equivalent to:
+
+      kspace_style     pppm/omp   1e-4
+
+because no pppm style has been implemented for the Intel package.
+
+-----------------------------------------------------------------------------
+
+In order to use offload to Xeon Phi, the flag -DLMP_INTEL_OFFLOAD should be
+set in the Makefile. Offload requires the use of Intel compilers.
diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp
new file mode 100644
index 0000000000..8fd3003b49
--- /dev/null
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -0,0 +1,530 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_request.h"
+#include "pair.h"
+#include "pair_hybrid.h"
+#include "pair_hybrid_overlay.h"
+#include "timer.h"
+#include "universe.h"
+#include "update.h"
+#include "fix_intel.h"
+
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "suffix.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+#ifdef __INTEL_OFFLOAD
+#ifndef _LMP_INTEL_OFFLOAD
+#warning "Not building Intel package with Xeon Phi offload support."
+#endif
+#endif
+
+enum{NSQ,BIN,MULTI};
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the fix from the arguments of the "package intel" command.
+//   arg[1] must be the group "all"; keywords are parsed starting at arg[4]:
+//     mixed | double | single      precision mode (default: mixed)
+//     offload_affinity_balanced    flag, takes no value
+//     balance <value>              offload balance (values > 1.0 are rejected)
+//     offload_threads <n>          must be > 0
+//     offload_tpc <n>              must be in 1..4
+//     offload_cards <n>            must be > 0 in offload builds
+//     buffers <n>                  nonzero allows separate buffers
+//     offload_ghost <n>
+// Any unknown keyword, missing value or out-of-range setting raises an
+// error on all ranks.
+FixIntel::FixIntel(LAMMPS *lmp, int narg, char **arg) :  Fix(lmp, narg, arg)
+{
+  if (narg < 4)
+    error->all(FLERR, "Illegal package intel command");
+  if (strcmp(arg[1],"all") != 0)
+    error->all(FLERR, "fix Intel has to operate on group 'all'");
+
+  // Only the buffer object matching the precision mode is allocated at the
+  // end of this constructor; give the other two pointers (and the getters
+  // that expose them) a defined value.
+  _single_buffers = NULL;
+  _mixed_buffers = NULL;
+  _double_buffers = NULL;
+
+  // init() refreshes this; set it here so memory_usage() never reads an
+  // indeterminate value.
+  _nthreads = comm->nthreads;
+
+  _precision_mode = PREC_MODE_MIXED;
+  _offload_balance = 1.0;
+  // clear every slot, not only LMP_OVERFLOW: init() transfers the whole
+  // offload flag array to the coprocessor
+  memset(_overflow_flag, 0, sizeof(_overflow_flag));
+  memset(_off_overflow_flag, 0, sizeof(_off_overflow_flag));
+
+  _offload_affinity_balanced = 0;
+  _offload_threads = 1;
+  _offload_tpc = 4;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _offload_affinity_set = 0;
+  _off_force_array_s = 0;
+  _off_force_array_m = 0;
+  _off_force_array_d = 0;
+  _off_ev_array_s = 0;
+  _off_ev_array_d = 0;
+  _balance_fixed = 0.0;
+
+  // init() only assigns _offload_noghost when newton_pair is off or
+  // offload_ghost is 0, so it needs a defined default here.
+  _offload_noghost = 0;
+  // set_offload_affinity() assigns the real value; until then
+  // output_timing_data() must see a defined value.
+  _im_real_space_task = 0;
+
+  _cop = 0;
+
+  // query the coprocessor for its core count and maximum thread count
+  int max_offload_threads, offload_cores;
+  #pragma offload target(mic:_cop) mandatory \
+    out(max_offload_threads,offload_cores)
+  {
+    offload_cores = omp_get_num_procs();
+    omp_set_num_threads(offload_cores);
+    max_offload_threads = omp_get_max_threads();
+  }
+  _max_offload_threads = max_offload_threads;
+  _offload_cores = offload_cores;
+  _offload_threads = offload_cores;
+  #endif
+  int ncops = 1;
+  _allow_separate_buffers = 1;
+  _offload_ghost = -1;
+
+  int iarg = 4;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg], "mixed") == 0)
+      _precision_mode = PREC_MODE_MIXED;
+    else if (strcmp(arg[iarg], "double") == 0)
+      _precision_mode = PREC_MODE_DOUBLE;
+    else if (strcmp(arg[iarg], "single") == 0)
+      _precision_mode = PREC_MODE_SINGLE;
+    else if (strcmp(arg[iarg], "offload_affinity_balanced") == 0)
+      _offload_affinity_balanced = 1;
+    else if (strcmp(arg[iarg], "balance") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      ++iarg;
+      _offload_balance = force->numeric(FLERR,arg[iarg]);
+    } else if (strcmp(arg[iarg], "offload_threads") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      ++iarg;
+      _offload_threads = atoi(arg[iarg]);
+    } else if (strcmp(arg[iarg], "offload_tpc") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      ++iarg;
+      _offload_tpc = atoi(arg[iarg]);
+    } else if (strcmp(arg[iarg], "offload_cards") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      ++iarg;
+      ncops = atoi(arg[iarg]);
+    } else if (strcmp(arg[iarg], "buffers") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      ++iarg;
+      _allow_separate_buffers = atoi(arg[iarg]);
+    } else if (strcmp(arg[iarg], "offload_ghost") == 0) {
+      if (iarg == narg - 1)
+        error->all(FLERR, "Illegal package intel mode requested");
+      ++iarg;
+      _offload_ghost = atoi(arg[iarg]);
+    } else
+      error->all(FLERR, "Illegal package intel mode requested");
+    ++iarg;
+  }
+
+  if (_offload_balance > 1.0 || _offload_threads <= 0 ||
+      _offload_tpc <= 0 || _offload_tpc > 4)
+    error->all(FLERR, "Illegal package intel mode requested");
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  // the card count is later used as a divisor when mapping MPI tasks to
+  // coprocessors, so it has to be positive
+  if (ncops <= 0)
+    error->all(FLERR, "Illegal package intel mode requested");
+  _ncops = ncops;
+  // a negative balance starts both balance fractions at 0.9 instead of a
+  // user-supplied value
+  if (_offload_balance < 0.0) {
+    _balance_neighbor = 0.9;
+    _balance_pair = 0.9;
+  } else {
+    _balance_neighbor = _offload_balance;
+    _balance_pair = _offload_balance;
+  }
+
+  _tscreen = screen;
+  zero_timers();
+  _setup_time_cleared = false;
+  _timers_allocated = false;
+  #else
+  // without offload support everything runs on the host
+  _offload_balance = 0.0;
+  #endif
+
+  if (_precision_mode == PREC_MODE_SINGLE)
+    _single_buffers = new IntelBuffers<float,float>(lmp);
+  else if (_precision_mode == PREC_MODE_MIXED)
+    _mixed_buffers = new IntelBuffers<float,double>(lmp);
+  else
+    _double_buffers = new IntelBuffers<double,double>(lmp);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Print any pending offload timing data, release the coprocessor copies of
+// the stopwatch/overflow arrays (offload builds only), then free the buffer
+// object that belongs to the active precision mode.
+FixIntel::~FixIntel()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  output_timing_data();
+  double *time1 = off_watch_pair();
+  double *time2 = off_watch_neighbor();
+  int *overflow = get_off_overflow_flag();
+  if (_timers_allocated && time1 != NULL && time2 != NULL &&
+      overflow != NULL) {
+    #pragma offload_transfer target(mic:_cop) \
+      nocopy(time1,time2,overflow:alloc_if(0) free_if(1))
+  }
+  #endif
+
+  switch (_precision_mode) {
+  case PREC_MODE_SINGLE:
+    delete _single_buffers;
+    break;
+  case PREC_MODE_MIXED:
+    delete _mixed_buffers;
+    break;
+  default:
+    delete _double_buffers;
+    break;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// This fix sets no bits in the fix mask.
+int FixIntel::setmask()
+{
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Per-run initialization.
+//   Offload builds: force per-step sorting when offloading, pick the ghost
+//   offload mode, set coprocessor affinity, flush timing output, allocate
+//   the coprocessor copies of the stopwatch/overflow arrays once, and
+//   choose when offload results are synchronized.
+//   All builds: reject unsupported neighbor settings, reject more than one
+//   intel sub-style under pair hybrid, register this fix with the neighbor
+//   class and zero the energy/virial accumulators of the active buffers.
+void FixIntel::init()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_offload_balance != 0.0) atom->sortfreq = 1;
+
+  if (force->newton_pair == 0)
+    _offload_noghost = 0;
+  else if (_offload_ghost == 0)
+    _offload_noghost = 1;
+
+  set_offload_affinity();
+
+  output_timing_data();
+  if (!_timers_allocated) {
+    double *time1 = off_watch_pair();
+    double *time2 = off_watch_neighbor();
+    int *overflow = get_off_overflow_flag();
+    if (time1 != NULL && time2 != NULL && overflow != NULL) {
+      #pragma offload_transfer target(mic:_cop)  \
+        nocopy(time1,time2:length(1) alloc_if(1) free_if(0)) \
+        in(overflow:length(5) alloc_if(1) free_if(0))
+    }
+    _timers_allocated = true;
+  }
+
+  // print summary of settings
+  if (comm->me == 0 && screen && _offload_balance != 0.0) {
+    fprintf(screen,"using offload with %d threads per core, ",_offload_tpc);
+    fprintf(screen,"%d threads per task\n",_offload_threads);
+  }
+
+  if (update->whichflag == 2 && _offload_balance != 0.0) {
+    // whichflag == 2 with offload active: results are synchronized from
+    // within the pair computation (see add_result_array)
+    if (_offload_balance == 1.0 && _offload_noghost == 0)
+      _sync_at_pair = 1;
+    else
+      _sync_at_pair = 2;
+  } else {
+    _sync_at_pair = 0;
+    if (strstr(update->integrate_style,"intel") == 0)
+      error->all(FLERR,
+                 "Specified run_style does not support the Intel package.");
+  }
+  #endif
+
+  if (neighbor->style != BIN)
+    error->all(FLERR,
+            "Currently, neighbor style BIN must be used with Intel package.");
+  if (neighbor->exclude_setting() != 0)
+    error->all(FLERR,
+            "Currently, cannot use neigh_modify exclude with Intel package.");
+
+  // Count the hybrid sub-styles that ARE intel styles. The count must match
+  // the error message below: at most one "/intel" sub-style is allowed.
+  int nstyles = 0;
+  if (force->pair_match("hybrid", 1) != NULL) {
+    PairHybrid *hybrid = (PairHybrid *) force->pair;
+    for (int i = 0; i < hybrid->nstyles; i++)
+      if (strstr(hybrid->keywords[i], "/intel") != NULL)
+        nstyles++;
+  } else if (force->pair_match("hybrid/overlay", 1) != NULL) {
+    PairHybridOverlay *hybrid = (PairHybridOverlay *) force->pair;
+    for (int i = 0; i < hybrid->nstyles; i++)
+      if (strstr(hybrid->keywords[i], "/intel") != NULL) {
+        nstyles++;
+        // an intel sub-style is present: disable the f-dot-r virial on the
+        // hybrid/overlay pair style
+        force->pair->no_virial_fdotr_compute = 1;
+      }
+  }
+  if (nstyles > 1)
+    error->all(FLERR,
+               "Currently, cannot use more than one intel style with hybrid.");
+
+  neighbor->fix_intel = (void *)this;
+  _nthreads = comm->nthreads;
+
+  check_neighbor_intel();
+  if (_precision_mode == PREC_MODE_SINGLE)
+    _single_buffers->zero_ev();
+  else if (_precision_mode == PREC_MODE_MIXED)
+    _mixed_buffers->zero_ev();
+  else
+    _double_buffers->zero_ev();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Walk all neighbor list requests. In offload builds, any request that is
+// not flagged as an intel request while offload is active forces a full
+// host list and turns off the no-ghost offload mode. A request with the
+// skip flag set is an error in every build.
+void FixIntel::check_neighbor_intel()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  _full_host_list = 0;
+  #endif
+
+  const int nreq = neighbor->nrequest;
+  int ireq = 0;
+  while (ireq < nreq) {
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_offload_balance != 0.0) {
+      if (neighbor->requests[ireq]->intel == 0) {
+        _full_host_list = 1;
+        _offload_noghost = 0;
+      }
+    }
+    #endif
+    if (neighbor->requests[ireq]->skip)
+      error->all(FLERR, "Cannot yet use hybrid styles with Intel package.");
+    ++ireq;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Fold the pending offload results (if any) into the host arrays. At most
+// one pending force array is consumed per call, checked in the order
+// mixed, double, single; the consumed pointer is reset to 0.
+// Does nothing in builds without offload support.
+void FixIntel::sync_coprocessor()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_offload_balance == 0.0) return;
+
+  if (_off_force_array_m != 0) {
+    add_off_results(_off_force_array_m, _off_ev_array_d);
+    _off_force_array_m = 0;
+    return;
+  }
+  if (_off_force_array_d != 0) {
+    add_off_results(_off_force_array_d, _off_ev_array_d);
+    _off_force_array_d = 0;
+    return;
+  }
+  if (_off_force_array_s != 0) {
+    add_off_results(_off_force_array_s, _off_ev_array_s);
+    _off_force_array_s = 0;
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Report the memory usage of the buffer object that belongs to the active
+// precision mode, as returned by its memory_usage(_nthreads).
+double FixIntel::memory_usage()
+{
+  double bytes;
+  switch (_precision_mode) {
+  case PREC_MODE_SINGLE:
+    bytes = _single_buffers->memory_usage(_nthreads);
+    break;
+  case PREC_MODE_MIXED:
+    bytes = _mixed_buffers->memory_usage(_nthreads);
+    break;
+  default:
+    bytes = _double_buffers->memory_usage(_nthreads);
+    break;
+  }
+  return bytes;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef _LMP_INTEL_OFFLOAD
+
+// Average the internal timers over all tasks of the real-space
+// communicator and, if any time was recorded, print the offload timing
+// table on rank 0 of that communicator; afterwards the timers are reset.
+// Returns immediately on tasks that are not real-space tasks or when
+// offload affinity has not been set up.
+void FixIntel::output_timing_data() {
+  if (_im_real_space_task == 0 || _offload_affinity_set == 0) return;
+
+  double timer_total = 0.0;
+  int size, rank;
+  double timers[NUM_ITIMERS];
+  MPI_Comm_size(_real_space_comm, &size);
+  MPI_Comm_rank(_real_space_comm, &rank);
+  MPI_Allreduce(&_timers, &timers, NUM_ITIMERS, MPI_DOUBLE, MPI_SUM,
+                _real_space_comm);
+  for (int i=0; i < NUM_ITIMERS; i++) {
+    timers[i] /= size;
+    timer_total += timers[i];
+  }
+  #ifdef TIME_BALANCE
+  double timers_min[NUM_ITIMERS], timers_max[NUM_ITIMERS];
+  MPI_Allreduce(&_timers, &timers_max, NUM_ITIMERS, MPI_DOUBLE, MPI_MAX,
+                _real_space_comm);
+  MPI_Allreduce(&_timers, &timers_min, NUM_ITIMERS, MPI_DOUBLE, MPI_MIN,
+                _real_space_comm);
+  #endif
+
+  // timer_total comes from an Allreduce, so every rank takes the same
+  // branch and the collective MPI_Reduce below is matched on all ranks
+  if (timer_total > 0.0) {
+    double balance_out[2];
+    // MPI_Reduce fills the receive buffer on the root only; initialize it
+    // so the averaging below never reads indeterminate values elsewhere
+    double balance_in[2] = {0.0, 0.0};
+    balance_out[0] = _balance_pair;
+    balance_out[1] = _balance_neighbor;
+    MPI_Reduce(balance_out, balance_in, 2, MPI_DOUBLE, MPI_SUM,
+               0, _real_space_comm);
+    balance_in[0] /= size;
+    balance_in[1] /= size;
+
+    if (rank == 0 && _tscreen) {
+      fprintf(_tscreen, "\n------------------------------------------------\n");
+      fprintf(_tscreen, "               Offload Timing Data\n");
+      fprintf(_tscreen, "------------------------------------------------\n");
+      fprintf(_tscreen, "  Data Pack/Cast Seconds    %f\n",
+              timers[TIME_PACK]);
+      if (_offload_balance != 0.0) {
+        fprintf(_tscreen, "  Host Neighbor Seconds     %f\n",
+                timers[TIME_HOST_NEIGHBOR]);
+        fprintf(_tscreen, "  Host Pair Seconds         %f\n",
+                timers[TIME_HOST_PAIR]);
+        fprintf(_tscreen, "  Offload Neighbor Seconds  %f\n",
+                timers[TIME_OFFLOAD_NEIGHBOR]);
+        fprintf(_tscreen, "  Offload Pair Seconds      %f\n",
+                timers[TIME_OFFLOAD_PAIR]);
+        fprintf(_tscreen, "  Offload Wait Seconds      %f\n",
+                timers[TIME_OFFLOAD_WAIT]);
+        fprintf(_tscreen, "  Offload Latency Seconds   %f\n",
+                timers[TIME_OFFLOAD_LATENCY]);
+        fprintf(_tscreen, "  Offload Neighbor Balance  %f\n",
+                balance_in[1]);
+        fprintf(_tscreen, "  Offload Pair Balance      %f\n",
+                balance_in[0]);
+        fprintf(_tscreen, "  Offload Ghost Atoms       ");
+        if (_offload_noghost) fprintf(_tscreen,"No\n");
+        else fprintf(_tscreen,"Yes\n");
+        #ifdef TIME_BALANCE
+        fprintf(_tscreen, "  Offload Imbalance Seconds %f\n",
+                timers[TIME_IMBALANCE]);
+        fprintf(_tscreen, "  Offload Min/Max Seconds   ");
+        for (int i = 0; i < NUM_ITIMERS; i++)
+          fprintf(_tscreen, "[%f, %f] ",timers_min[i],timers_max[i]);
+        fprintf(_tscreen, "\n");
+        #endif
+      }
+      fprintf(_tscreen, "------------------------------------------------\n");
+    }
+    zero_timers();
+    _setup_time_cleared = false;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Count the MPI tasks of the real-space communicator that report the same
+// processor name as the calling task.
+//   node_rank (out): number of such tasks whose rank is lower than ours,
+//                    i.e. this task's index among the tasks on its node
+//   returns:         number of tasks sharing this task's processor name
+int FixIntel::get_ppn(int &node_rank) {
+  int nprocs;
+  int rank;
+  MPI_Comm_size(_real_space_comm, &nprocs);
+  MPI_Comm_rank(_real_space_comm, &rank);
+
+  int name_length;
+  char node_name[MPI_MAX_PROCESSOR_NAME];
+  MPI_Get_processor_name(node_name,&name_length);
+  node_name[name_length] = '\0';
+
+  // gather every task's fixed-width name record into one flat array
+  char *node_names = new char[MPI_MAX_PROCESSOR_NAME*nprocs];
+  MPI_Allgather(node_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR, node_names,
+                MPI_MAX_PROCESSOR_NAME, MPI_CHAR, _real_space_comm);
+  int ppn = 0;
+  node_rank = 0;
+  for (int i = 0; i < nprocs; i++) {
+    if (strcmp(node_name, node_names + i * MPI_MAX_PROCESSOR_NAME) == 0) {
+      ppn++;
+      if (i < rank)
+        node_rank++;
+    }
+  }
+
+  // the gathered names are only needed for the count above
+  delete [] node_names;
+
+  return ppn;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Decide whether separate buffers are used, determine the communicator of
+// the real-space tasks, and (once, when offload is active) map this task to
+// a coprocessor, clamp the offload thread count and pin the offload threads
+// to coprocessor logical processors.
+void FixIntel::set_offload_affinity()
+{
+  _separate_buffers = 0;
+  if (_allow_separate_buffers)
+    if (_offload_balance != 0.0 && _offload_balance < 1.0)
+      _separate_buffers = 1;
+
+  _im_real_space_task = 1;
+  if (strncmp(update->integrate_style,"verlet/split",12) == 0) {
+    _real_space_comm = world;
+    // with verlet/split only the tasks of world 0 are treated as
+    // real-space tasks; all others stop here
+    if (universe->iworld != 0) {
+      _im_real_space_task = 0;
+      return;
+    }
+  } else
+    _real_space_comm = universe->uworld;
+
+  if (_offload_balance == 0.0) _cop = -1;
+  if (_offload_balance == 0.0 || _offload_affinity_set == 1)
+    return;
+
+  _offload_affinity_set = 1;
+  int node_rank;
+  int ppn = get_ppn(node_rank);
+
+  // _ncops is a divisor below; a non-positive card count can never evenly
+  // divide the tasks on a node
+  if (_ncops <= 0 || ppn % _ncops != 0)
+    error->all(FLERR, "MPI tasks per node must be multiple of offload_cards");
+  // from here on ppn is the number of tasks sharing one coprocessor and
+  // node_rank is this task's index among them
+  ppn = ppn / _ncops;
+  _cop = node_rank / ppn;
+  node_rank = node_rank % ppn;
+
+  // NOTE(review): the constants 4, 60 and 240 below look tied to the
+  // hardware-thread layout of a specific coprocessor model - confirm.
+  int max_threads_per_task = _offload_cores / 4 * _offload_tpc / ppn;
+  if (_offload_threads > max_threads_per_task)
+    _offload_threads = max_threads_per_task;
+  if (_offload_threads > _max_offload_threads)
+    _offload_threads = _max_offload_threads;
+  // with many tasks per card the limits above can reach zero, but
+  // omp_set_num_threads() requires a positive thread count
+  if (_offload_threads < 1)
+    _offload_threads = 1;
+
+  // copies of the members that are sent to the coprocessor
+  int offload_threads = _offload_threads;
+  int offload_tpc = _offload_tpc;
+  int offload_affinity_balanced = _offload_affinity_balanced;
+  #pragma offload target(mic:_cop) mandatory \
+    in(node_rank,offload_threads,offload_tpc,offload_affinity_balanced)
+  {
+    omp_set_num_threads(offload_threads);
+    #pragma omp parallel
+    {
+      int tnum = omp_get_thread_num();
+      kmp_affinity_mask_t mask;
+      kmp_create_affinity_mask(&mask);
+      int proc;
+      if (offload_affinity_balanced) {
+        proc = offload_threads * node_rank + tnum;
+        proc = proc * 4 - (proc / 60) * 240 + proc / 60 + 1;
+      } else {
+        proc = offload_threads * node_rank + tnum;
+        proc += (proc / 4) * (4 - offload_tpc) + 1;
+      }
+      kmp_set_affinity_mask_proc(proc, &mask);
+      if (kmp_set_affinity(&mask) != 0)
+        printf("Could not set affinity on rank %d thread %d to %d\n",
+               node_rank, tnum, proc);
+    }
+  }
+  if (_precision_mode == PREC_MODE_SINGLE)
+    _single_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
+  else if (_precision_mode == PREC_MODE_MIXED)
+    _mixed_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
+  else
+    _double_buffers->set_off_params(offload_threads, _cop, _separate_buffers);
+}
+
+#endif
diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h
new file mode 100644
index 0000000000..82ebc734a2
--- /dev/null
+++ b/src/USER-INTEL/fix_intel.h
@@ -0,0 +1,593 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(Intel,FixIntel)
+
+#else
+
+#ifndef LMP_FIX_INTEL_H
+#define LMP_FIX_INTEL_H
+
+#include "fix.h"
+#include "intel_buffers.h"
+#include "force.h"
+#include "pair.h"
+#include "error.h"
+#include "update.h"
+
+namespace LAMMPS_NS {
+
+class IntelData;
+template <class flt_t, class acc_t> class IntelBuffers;
+ 
+// Fix that holds the shared state of the Intel package: the precision
+// mode, the IntelBuffers object for that mode and, in builds with
+// _LMP_INTEL_OFFLOAD, the offload balance, coprocessor and timer state.
+class FixIntel : public Fix {
+ public:
+  FixIntel(class LAMMPS *, int, char **);
+  virtual ~FixIntel();
+  virtual int setmask();
+  virtual void init();
+
+  // Get all forces, calculation results from coprocessor
+  void sync_coprocessor();
+
+  double memory_usage();
+
+  // three doubles; used to view the atom force/torque arrays as vectors
+  typedef struct { double x,y,z; } lmp_ft;
+
+  enum {PREC_MODE_SINGLE, PREC_MODE_MIXED, PREC_MODE_DOUBLE};
+  
+  // active precision mode (one of the PREC_MODE_* values)
+  inline int precision() { return _precision_mode; }
+  // buffer accessors; the constructor allocates only the buffer object
+  // that matches the precision mode
+  inline IntelBuffers<float,float> * get_single_buffers() 
+    { return _single_buffers; }
+  inline IntelBuffers<float,double> * get_mixed_buffers() 
+    { return _mixed_buffers; }
+  inline IntelBuffers<double,double> * get_double_buffers() 
+    { return _double_buffers; }
+
+ protected:
+  IntelBuffers<float,float> *_single_buffers;
+  IntelBuffers<float,double> *_mixed_buffers;
+  IntelBuffers<double,double> *_double_buffers;
+
+  int _precision_mode, _nthreads;
+
+ public:
+  // host-side and offload-side flag arrays (5 ints each)
+  inline int* get_overflow_flag() { return _overflow_flag; }
+  inline int* get_off_overflow_flag() { return _off_overflow_flag; }
+  // Hand a force array and global energy/virial array to the fix, one
+  // overload per precision mode. Host results (offload == 0) are added to
+  // the atom arrays immediately; in offload builds, offload results are
+  // stored and added later by sync_coprocessor().
+  inline void add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
+                               double *ev_in, const int offload,
+                               const int eatom = 0, const int vatom = 0);
+  inline void add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
+                               double *ev_in, const int offload,
+                               const int eatom = 0, const int vatom = 0);
+  inline void add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
+                               float *ev_in, const int offload,
+                               const int eatom = 0, const int vatom = 0);
+  // report local count, total count and first local index for the host
+  // (offload == 0) or offload (offload != 0) part of the calculation
+  inline void get_buffern(const int offload, int &nlocal, int &nall, 
+			  int &minlocal);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  inline int coprocessor_number() { return _cop; }
+  inline int full_host_list() { return _full_host_list; }
+  void set_offload_affinity();
+  inline double offload_balance() { return _offload_balance; }
+  // end index of the offloaded atom range: balance fraction times nlocal
+  inline int offload_end_neighbor() { return _balance_neighbor * atom->nlocal; }
+  inline int offload_end_pair();
+  // the host starts at 0 in no-ghost offload mode, otherwise where the
+  // offloaded range ends
+  inline int host_start_neighbor()
+    { if (_offload_noghost) return 0; else return offload_end_neighbor(); }
+  inline int host_start_pair()
+    { if (_offload_noghost) return 0; else return offload_end_pair(); }
+  inline int offload_nlocal() { return _offload_nlocal; }
+  inline int offload_nall() { return _offload_nall; }
+  inline int offload_min_ghost() { return _offload_min_ghost; }
+  inline int host_min_local() { return _host_min_local; }
+  inline int host_min_ghost() { return _host_min_ghost; }
+  inline int host_used_local() { return _host_used_local; }
+  inline int host_used_ghost() { return _host_used_ghost; }
+  inline int host_nall() { return _host_nall; }
+  inline int separate_buffers() { return _separate_buffers; }
+  inline int offload_noghost() { return _offload_noghost; }
+  // only takes effect while _offload_ghost is negative (its value when the
+  // offload_ghost keyword was not given)
+  inline void set_offload_noghost(const int v) 
+    { if (_offload_ghost < 0) _offload_noghost = v; }
+  inline void set_neighbor_host_sizes();
+
+  inline void zero_timers()
+    { memset(_timers, 0, sizeof(double) * NUM_ITIMERS); }
+  // record the start time of timer 'which'
+  inline void start_watch(const int which) { _stopwatch[which] = MPI_Wtime(); }
+  inline double stop_watch(const int which);
+  inline double * off_watch_pair() { return _stopwatch_offload_pair; }
+  inline double * off_watch_neighbor() { return _stopwatch_offload_neighbor; }
+  inline void balance_stamp();
+  inline void acc_timers();
+  #else
+  // stubs used when the package is built without offload support
+  inline int offload_end_neighbor() { return 0; }
+  inline int offload_end_pair() { return 0; }
+  inline int host_start_neighbor() { return 0; }
+  inline int host_start_pair() { return 0; }
+  inline void zero_timers() {}
+  inline void start_watch(const int which) {}
+  inline double stop_watch(const int which) { return 0.0; }
+  double * off_watch_pair() { return NULL; }
+  double * off_watch_neighbor() { return NULL; }
+  inline void balance_stamp() {}
+  inline void acc_timers() {}
+  inline int separate_buffers() { return 0; }
+  #endif
+
+ protected:
+  // flag arrays indexed by the LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX,
+  // LMP_GHOST_MIN and LMP_GHOST_MAX constants
+  int _overflow_flag[5];
+  __declspec(align(64)) int _off_overflow_flag[5];
+  int _allow_separate_buffers, _offload_ghost;
+  #ifdef _LMP_INTEL_OFFLOAD
+  double _balance_pair_time, _balance_other_time;
+  int _offload_nlocal, _offload_nall, _offload_min_ghost, _offload_nghost;
+  int _host_min_local, _host_min_ghost, _host_nall;
+  int _host_used_local, _host_used_ghost;
+  int _separate_buffers, _offload_noghost, _sync_at_pair;
+  bool _setup_time_cleared, _timers_allocated;
+  void output_timing_data();
+  // stream used for the timing table; set from 'screen' in the constructor
+  FILE *_tscreen;
+
+  // pending offload results, one force pointer per precision mode; reset
+  // to 0 by sync_coprocessor() once consumed
+  IntelBuffers<float,float>::vec3_acc_t *_off_force_array_s;
+  IntelBuffers<float,double>::vec3_acc_t *_off_force_array_m;
+  IntelBuffers<double,double>::vec3_acc_t *_off_force_array_d;
+  float *_off_ev_array_s;
+  double *_off_ev_array_d;
+  int _off_results_eatom, _off_results_vatom;
+  int _full_host_list, _cop, _ncops;
+
+  int get_ppn(int &);
+  #endif
+  void check_neighbor_intel();
+
+  double _offload_balance, _balance_neighbor, _balance_pair, _balance_fixed;
+  // accumulated seconds and last start stamp per timer
+  double _timers[NUM_ITIMERS];
+  double _stopwatch[NUM_ITIMERS];
+  __declspec(align(64)) double _stopwatch_offload_neighbor[1];
+  __declspec(align(64)) double _stopwatch_offload_pair[1];
+
+  // add a result array to the atom arrays, choosing the atom ranges from
+  // the buffer layout and the newton_pair setting
+  template <class ft, class acc_t>
+  inline void add_results(const ft * restrict const f_in,
+                          const acc_t * restrict const ev_global,
+                          const int eatom, const int vatom,
+			  const int offload);
+
+  // add nall entries of f_in to the atom arrays starting at out_offset
+  template <class ft, class acc_t>
+  inline void add_oresults(const ft * restrict const f_in,
+			   const acc_t * restrict const ev_global,
+			   const int eatom, const int vatom,
+			   const int out_offset, const int nall);
+
+  int _offload_affinity_balanced, _offload_threads, _offload_tpc;
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _max_offload_threads, _offload_cores, _offload_affinity_set;
+  int _im_real_space_task;
+  MPI_Comm _real_space_comm;
+  template <class ft, class acc_t>
+  inline void add_off_results(const ft * restrict const f_in,
+                              const acc_t * restrict const ev_global);
+  #endif
+};
+
+/* ---------------------------------------------------------------------- */
+
+// Report the atom counts that apply to the host (offload == 0) or the
+// offload (offload != 0) part of a calculation.
+//   nlocal   (out): number of local atoms
+//   nall     (out): number of local plus ghost atoms covered
+//   minlocal (out): index of the first local atom covered
+void FixIntel::get_buffern(const int offload, int &nlocal, int &nall,
+			   int &minlocal) {
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_separate_buffers) {
+    if (offload) {
+      // on steps without a neighbor rebuild use the stored offload sizes
+      if (neighbor->ago != 0) {
+	nlocal = _offload_nlocal;
+	nall = _offload_nall;
+      } else {
+	nlocal = atom->nlocal;
+	nall = nlocal + atom->nghost;
+      }
+      minlocal = 0;
+    } else {
+      nlocal = atom->nlocal;
+      nall = _host_nall;
+      minlocal = _host_min_local;
+    }
+    return;
+  }
+  // no-ghost offload mode: the offload part covers local atoms only
+  if (_offload_noghost && offload)
+    nall = atom->nlocal;
+  else
+  #endif
+    // note: in offload builds this statement is the 'else' branch above
+    nall = atom->nlocal + atom->nghost;
+  nlocal = atom->nlocal;
+  minlocal = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Double precision overload. Offload results (offload builds only) are
+// remembered for sync_coprocessor(); host results are added to the atom
+// arrays right away and a host neighbor list overflow is reported.
+void FixIntel::add_result_array(IntelBuffers<double,double>::vec3_acc_t *f_in,
+                                double *ev_in, const int offload,
+                                const int eatom, const int vatom) {
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {
+    // store the pointers and flags; the data is consumed on the next sync
+    _off_results_eatom = eatom;
+    _off_results_vatom = vatom;
+    _off_force_array_d = f_in;
+    _off_ev_array_d = ev_in;
+    if (_sync_at_pair == 1) sync_coprocessor();
+    return;
+  }
+  #endif
+  add_results(f_in, ev_in, eatom, vatom, 0);
+  if (_overflow_flag[LMP_OVERFLOW])
+    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+  #ifdef _LMP_INTEL_OFFLOAD
+  // any nonzero sync mode synchronizes after the host results are added
+  if (_sync_at_pair) sync_coprocessor();
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Mixed precision overload. Offload results (offload builds only) are
+// remembered for sync_coprocessor(); host results are added to the atom
+// arrays right away and a host neighbor list overflow is reported.
+void FixIntel::add_result_array(IntelBuffers<float,double>::vec3_acc_t *f_in,
+                                double *ev_in, const int offload,
+                                const int eatom, const int vatom) {
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {
+    // store the pointers and flags; the data is consumed on the next sync
+    _off_results_eatom = eatom;
+    _off_results_vatom = vatom;
+    _off_force_array_m = f_in;
+    _off_ev_array_d = ev_in;
+    if (_sync_at_pair == 1) sync_coprocessor();
+    return;
+  }
+  #endif
+  add_results(f_in, ev_in, eatom, vatom, 0);
+  if (_overflow_flag[LMP_OVERFLOW])
+    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+  #ifdef _LMP_INTEL_OFFLOAD
+  // any nonzero sync mode synchronizes after the host results are added
+  if (_sync_at_pair) sync_coprocessor();
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Single precision overload. Offload results (offload builds only) are
+// remembered for sync_coprocessor(); host results are added to the atom
+// arrays right away and a host neighbor list overflow is reported.
+void FixIntel::add_result_array(IntelBuffers<float,float>::vec3_acc_t *f_in,
+                                float *ev_in, const int offload,
+                                const int eatom, const int vatom) {
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload) {
+    // store the pointers and flags; the data is consumed on the next sync
+    _off_results_eatom = eatom;
+    _off_results_vatom = vatom;
+    _off_force_array_s = f_in;
+    _off_ev_array_s = ev_in;
+    if (_sync_at_pair == 1) sync_coprocessor();
+    return;
+  }
+  #endif
+  add_results(f_in, ev_in, eatom, vatom, 0);
+  if (_overflow_flag[LMP_OVERFLOW])
+    error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+  #ifdef _LMP_INTEL_OFFLOAD
+  // any nonzero sync mode synchronizes after the host results are added
+  if (_sync_at_pair) sync_coprocessor();
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Add a result array to the atom arrays, timed under TIME_PACK.
+// With separate buffers (offload builds) the local part and, if
+// newton_pair is on, the ghost part are added as two ranges; otherwise a
+// single range starting at atom 0 is added.
+//   f_in      : per-atom results
+//   ev_global : global energy/virial accumulators
+//   offload   : nonzero if f_in holds offload results
+template <class ft, class acc_t>
+void FixIntel::add_results(const ft * restrict const f_in,
+                           const acc_t * restrict const ev_global,
+                           const int eatom, const int vatom,
+			   const int offload) {
+  start_watch(TIME_PACK);
+  int f_length;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_separate_buffers) {
+    if (offload) {
+      add_oresults(f_in, ev_global, eatom, vatom, 0, _offload_nlocal);
+      if (force->newton_pair) {
+	// ghost part: no global accumulators are passed for this range
+	const acc_t * restrict const enull = 0;
+	int offset = _offload_nlocal;
+	// with torque each atom occupies two entries of f_in
+	if (atom->torque) offset *= 2;
+	add_oresults(f_in + offset, enull, eatom, vatom, 
+		     _offload_min_ghost, _offload_nghost);
+      }
+    } else {
+      add_oresults(f_in, ev_global, eatom, vatom,
+		   _host_min_local, _host_used_local);
+      if (force->newton_pair) {
+	// ghost part: no global accumulators are passed for this range
+	const acc_t * restrict const enull = 0;
+	int offset = _host_used_local;
+	// with torque each atom occupies two entries of f_in
+	if (atom->torque) offset *= 2;
+	add_oresults(f_in + offset, enull, eatom, 
+		     vatom, _host_min_ghost, _host_used_ghost);
+      }
+    }
+    stop_watch(TIME_PACK);
+    return;
+  }
+  // shared buffers: ghosts are included only with newton_pair on and, for
+  // offload results, only when ghosts are offloaded
+  if (force->newton_pair && (_offload_noghost == 0 || offload == 0))
+    f_length = atom->nlocal + atom->nghost;
+  else
+    f_length = atom->nlocal;
+  #else
+  if (force->newton_pair)
+    f_length = atom->nlocal + atom->nghost;
+  else
+    f_length = atom->nlocal;
+  #endif
+
+  add_oresults(f_in, ev_global, eatom, vatom, 0, f_length);
+  stop_watch(TIME_PACK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Add nall entries of f_in to the atom arrays, starting at atom index
+// out_offset.
+//   f_in       : per-atom results; x,y,z hold the force and, when eatom is
+//                set, w holds the per-atom energy. With torque, entries
+//                come in pairs (force entry, then torque entry).
+//   ev_global  : if not NULL, 8 values added to eng_vdwl, eng_coul and the
+//                six virial components of the pair style
+//   eatom      : nonzero to tally per-atom energy
+//   vatom      : not used by this routine
+//   out_offset : index of the first atom that receives results
+//   nall       : number of atoms to process
+template <class ft, class acc_t>
+void FixIntel::add_oresults(const ft * restrict const f_in,
+                            const acc_t * restrict const ev_global,
+                            const int eatom, const int vatom,
+                            const int out_offset, const int nall) {
+  lmp_ft * restrict const f = (lmp_ft *) lmp->atom->f[0] + out_offset;
+  if (atom->torque) {
+    // the w slot of the second entry is used as an error code here
+    if (f_in[1].w) {
+      if (f_in[1].w == 1)
+        error->all(FLERR,"Bad matrix inversion in mldivide3");
+      else
+        error->all(FLERR,
+                   "Sphere particles not yet supported for gayberne/intel");
+    }
+  }
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none)
+  #endif
+  {
+    const int tid = omp_get_thread_num();
+    int ifrom, ito;
+    IP_PRE_omp_range_align(ifrom, ito, tid, nall, _nthreads, sizeof(acc_t));
+    if (atom->torque) {
+      int ii = ifrom * 2;
+      lmp_ft * restrict const tor = (lmp_ft *) lmp->atom->torque[0] +
+        out_offset;
+      if (eatom) {
+        // per-atom energy must be shifted by out_offset exactly like the
+        // force and torque arrays, otherwise it is tallied on other atoms
+        double * restrict const lmp_eatom = force->pair->eatom + out_offset;
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[ii].x;
+          f[i].y += f_in[ii].y;
+          f[i].z += f_in[ii].z;
+          lmp_eatom[i] += f_in[ii].w;
+          tor[i].x += f_in[ii+1].x;
+          tor[i].y += f_in[ii+1].y;
+          tor[i].z += f_in[ii+1].z;
+          ii += 2;
+        }
+      } else {
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[ii].x;
+          f[i].y += f_in[ii].y;
+          f[i].z += f_in[ii].z;
+          tor[i].x += f_in[ii+1].x;
+          tor[i].y += f_in[ii+1].y;
+          tor[i].z += f_in[ii+1].z;
+          ii += 2;
+        }
+      }
+    } else {
+      if (eatom) {
+        // same offset as the force array (see the torque branch)
+        double * restrict const lmp_eatom = force->pair->eatom + out_offset;
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[i].x;
+          f[i].y += f_in[i].y;
+          f[i].z += f_in[i].z;
+          lmp_eatom[i] += f_in[i].w;
+        }
+      } else {
+        for (int i = ifrom; i < ito; i++) {
+          f[i].x += f_in[i].x;
+          f[i].y += f_in[i].y;
+          f[i].z += f_in[i].z;
+        }
+      }
+    }
+  }
+
+  if (ev_global != NULL) {
+    force->pair->eng_vdwl += ev_global[0];
+    force->pair->eng_coul += ev_global[1];
+    force->pair->virial[0] += ev_global[2];
+    force->pair->virial[1] += ev_global[3];
+    force->pair->virial[2] += ev_global[4];
+    force->pair->virial[3] += ev_global[5];
+    force->pair->virial[4] += ev_global[6];
+    force->pair->virial[5] += ev_global[7];
+  }
+}
+
+#ifdef _LMP_INTEL_OFFLOAD
+
+/* ---------------------------------------------------------------------- */
+
+// End index of the offloaded atom range for the pair computation: nlocal
+// scaled by the neighbor balance on a neighbor-rebuild step, otherwise by
+// the pair balance.
+int FixIntel::offload_end_pair() {
+  const double fraction =
+    (neighbor->ago == 0) ? _balance_neighbor : _balance_pair;
+  return fraction * atom->nlocal;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Stops timer `which`: measures the wall time since _stopwatch[which] was
+// stamped, adds it to the running total _timers[which], and returns it.
+double FixIntel::stop_watch(const int which) {
+  const double now = MPI_Wtime();
+  const double delta = now - _stopwatch[which];
+  _timers[which] += delta;
+  return delta;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// When _offload_balance is negative, records the current wall time in
+// _balance_other_time and the time elapsed since the TIME_HOST_PAIR stopwatch
+// was started in _balance_pair_time. Does nothing otherwise.
+void FixIntel::balance_stamp() {
+  if (!(_offload_balance < 0.0)) return;
+
+  const double now = MPI_Wtime();
+  _balance_other_time = now;
+  _balance_pair_time = now - _stopwatch[TIME_HOST_PAIR];
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Folds the offload stopwatch values into the accumulated timers.
+// When neighbor->ago == 0 the neighbor stopwatch is added first and, the
+// first time this happens, all timers are reset via zero_timers().
+// The pair stopwatch is always added last (i.e. after any reset).
+void FixIntel::acc_timers() {
+  const bool ago_is_zero = (neighbor->ago == 0);
+
+  if (ago_is_zero)
+    _timers[TIME_OFFLOAD_NEIGHBOR] += *_stopwatch_offload_neighbor;
+
+  if (ago_is_zero && !_setup_time_cleared) {
+    zero_timers();
+    _setup_time_cleared = true;
+  }
+
+  _timers[TIME_OFFLOAD_PAIR] += *_stopwatch_offload_pair;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Derives the host-side atom ranges from the entries of _overflow_flag:
+// first local / first ghost index, number of local and ghost atoms in use
+// (ghost count clamped at zero), and the resulting total _host_nall.
+void FixIntel::set_neighbor_host_sizes() {
+  const int nlocal = atom->nlocal;
+
+  // local atoms
+  _host_min_local = _overflow_flag[LMP_LOCAL_MIN];
+  _host_used_local = nlocal - _host_min_local;
+
+  // ghost atoms; an empty range yields a negative span, so clamp to 0
+  _host_min_ghost = _overflow_flag[LMP_GHOST_MIN];
+  _host_used_ghost = _overflow_flag[LMP_GHOST_MAX] + 1 - _host_min_ghost;
+  if (_host_used_ghost < 0) _host_used_ghost = 0;
+
+  _host_nall = nlocal + _host_used_ghost;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Collects the results of an asynchronous coprocessor computation.
+//   f_in      - offload result buffer; the offload_wait pragma blocks on it
+//   ev_global - global accumulators, forwarded unchanged to add_results()
+// Steps: wait for the offload, refresh the offload atom counts when
+// neighbor->ago == 0, update the load-balance fraction (only when
+// _offload_balance < 0), accumulate timers, then merge via add_results().
+// Raises error->one on neighbor list overflow and error->all when the
+// torque path reports a failed matrix inversion (f_in[1].w < 0).
+template <class ft, class acc_t>
+void FixIntel::add_off_results(const ft * restrict const f_in,
+                               const acc_t * restrict const ev_global) {
+  // _balance_other_time holds the timestamp written by balance_stamp();
+  // convert it into the time elapsed since then
+  if (_offload_balance < 0.0)
+    _balance_other_time = MPI_Wtime() - _balance_other_time;
+
+  start_watch(TIME_OFFLOAD_WAIT);
+  #ifdef _LMP_INTEL_OFFLOAD
+  #pragma offload_wait target(mic:_cop) wait(f_in)
+  #endif
+  const double wait_time = stop_watch(TIME_OFFLOAD_WAIT);
+
+  if (neighbor->ago == 0) {
+    if (_off_overflow_flag[LMP_OVERFLOW])
+      error->one(FLERR, "Neighbor list overflow, boost neigh_modify one");
+    _offload_nlocal = _off_overflow_flag[LMP_LOCAL_MAX] + 1;
+    _offload_min_ghost = _off_overflow_flag[LMP_GHOST_MIN];
+    _offload_nghost = _off_overflow_flag[LMP_GHOST_MAX] + 1 -
+      _offload_min_ghost;
+    // an empty ghost range gives a negative count; clamp it
+    if (_offload_nghost < 0) _offload_nghost = 0;
+    _offload_nall = _offload_nlocal + _offload_nghost;
+  }
+
+  // Load balance?
+  if (_offload_balance < 0.0) {
+    if (neighbor->ago == 0)
+      _balance_pair = _balance_neighbor;
+    const double mic_time = *_stopwatch_offload_pair;
+    if (_balance_pair_time + _balance_other_time < mic_time) {
+      // The local must not be called "ft": that is the template parameter
+      // and redeclaring it inside the template is ill-formed.
+      const double fixed_time = _balance_pair_time + _balance_other_time +
+          wait_time - mic_time;
+      _balance_fixed = (1.0 - INTEL_LB_MEAN_WEIGHT) * _balance_fixed +
+          INTEL_LB_MEAN_WEIGHT * fixed_time;
+    }
+
+    // host and coprocessor times extrapolated to the full workload
+    const double ctps = _balance_pair_time / (1.0-_balance_pair);
+    const double otps = mic_time / _balance_pair;
+    double new_balance = (ctps + _balance_other_time - _balance_fixed) /
+        (otps + ctps);
+    if (new_balance < 0.01) new_balance = 0.01;
+    else if (new_balance > 0.99) new_balance = 0.99;
+    // blend the new estimate into the running average
+    _balance_neighbor = (1.0 - INTEL_LB_MEAN_WEIGHT) *_balance_neighbor +
+        INTEL_LB_MEAN_WEIGHT * new_balance;
+  }
+
+  #ifdef TIME_BALANCE
+  start_watch(TIME_IMBALANCE);
+  MPI_Barrier(_real_space_comm);
+  stop_watch(TIME_IMBALANCE);
+  #endif
+  acc_timers();
+  if (atom->torque) {
+    if (f_in[1].w < 0.0)
+      error->all(FLERR, "Bad matrix inversion in mldivide3");
+  }
+  add_results(f_in, ev_global, _off_results_eatom, _off_results_vatom, 1);
+}
+
+#endif
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+E: Neighbor list overflow, boost neigh_modify one
+
+Increase the value for neigh_modify one to allow for larger allocations for
+neighbor list builds. The value required can be different for the Intel
+package in order to support offload to a coprocessor.
+
+E: Bad matrix inversion in mldivide3
+
+This error should not occur unless the matrix is badly formed.
+
+E: Illegal package intel command
+
+The format for the package intel command is incorrect. Please see the
+documentation.
+
+E: fix intel has to operate on group 'all'
+
+Self-explanatory.
+
+E: Illegal package intel mode requested
+
+The format for the package intel command is incorrect. Please see the
+documentation.
+
+E: Specified run_style does not support the Intel package.
+
+When using offload to a coprocessor, the Intel package requires a run style
+with the intel suffix.
+
+E: Currently, neighbor style BIN must be used with Intel package.
+
+This is the only neighbor style that has been implemented for the Intel
+package.
+
+E: Currently, cannot use neigh_modify exclude with Intel package.
+
+This is a current restriction of the Intel package.
+
+E: Currently, cannot use more than one intel style with hybrid.
+
+Currently, hybrid pair styles can only use the intel suffix for one of the
+pair styles.
+
+E: Cannot yet use hybrid styles with Intel package.
+
+The hybrid pair style configuration is not yet supported by the Intel 
+package. Support is limited to hybrid/overlay or a hybrid style that does 
+not require a skip list.
+
+E: MPI tasks per node must be multiple of offload_cards
+
+For offload to multiple coprocessors on a single node, the Intel package
+requires that each coprocessor is used by the same number of MPI tasks.
+
+*/
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
new file mode 100644
index 0000000000..a541f0f359
--- /dev/null
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -0,0 +1,432 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "intel_buffers.h"
+#include "force.h"
+#include "memory.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+// Builds an empty buffer set: every size/counter is zero and every pointer
+// member is null, so the free_*() routines are no-ops until a grow call
+// allocates something.
+//   lmp_in - owning LAMMPS instance, used for memory, atom and neighbor access
+// The mem-initializers follow the member declaration order of the class.
+template <class flt_t, class acc_t>
+IntelBuffers<flt_t, acc_t>::IntelBuffers(class LAMMPS *lmp_in) :
+    lmp(lmp_in), _x(0), _q(0), _quat(0), _f(0), _off_threads(0),
+    _off_map_maxlocal(0), _list_alloc_atoms(0), _list_alloc(0),
+    _cnumneigh(0), _cutneighsq(0), _ntypes(0), _buf_size(0),
+    _buf_local_size(0) {
+  #ifdef _LMP_INTEL_OFFLOAD
+  _separate_buffers = 0;
+  _host_x = 0;
+  _host_q = 0;
+  _host_quat = 0;
+  _off_f = 0;
+  _off_map_ilist = 0;
+  _off_map_numneigh = 0;
+  _off_map_stencil = 0;
+  _off_map_special = 0;
+  _off_map_nspecial = 0;
+  _off_map_tag = 0;
+  _off_map_binhead = 0;
+  _off_map_bins = 0;
+  _off_map_nmax = 0;
+  _off_map_maxhead = 0;
+  _off_list_alloc = false;
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Releases everything the object may have allocated: the per-atom buffers,
+// all neighbor-related buffers, and (via set_ntypes(0)) the cutoff table.
+template <class flt_t, class acc_t>
+IntelBuffers<flt_t, acc_t>::~IntelBuffers()
+{
+  free_buffers();
+  free_all_nbor_buffers();
+  set_ntypes(0);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Frees the per-atom buffers (positions, charges, quaternions, forces) and
+// resets _buf_size/_buf_local_size to 0. Does nothing when _buf_size is 0.
+// In offload builds the coprocessor copies are released first, while the host
+// addresses named in the pragmas are still valid.
+// NOTE(review): the member pointers _q, _quat and _off_f are only freed
+// through local copies here and are not visibly reset to null - confirm that
+// Memory::destroy() or the callers make a later double free impossible.
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_buffers()
+{
+  if (_buf_size > 0) {
+    // get_x()/get_q()/get_quat() default to offload = 1, i.e. _x/_q/_quat
+    atom_t * x = get_x();
+    flt_t * q = get_q();
+    quat_t * quat = get_quat();
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    vec3_acc_t * f_start = get_off_f();
+    // a non-null offload force buffer is used as the sign that _grow()
+    // allocated coprocessor memory
+    if (f_start != 0) {
+      acc_t * ev_global = get_ev_global();
+      if (ev_global != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(x:alloc_if(0) free_if(1)) \
+	  nocopy(f_start:alloc_if(0) free_if(1)) \
+	  nocopy(ev_global:alloc_if(0) free_if(1))
+      }
+
+      if (q != 0) {
+        #pragma offload_transfer target (mic:_cop) \
+          nocopy(q:alloc_if(0) free_if(1))
+      }
+      if (quat != 0) {
+        #pragma offload_transfer target (mic:_cop) \
+          nocopy(quat:alloc_if(0) free_if(1))
+      }
+      lmp->memory->destroy(f_start);
+    }
+
+    // host-only mirrors exist only in separate-buffer mode
+    if (_separate_buffers) {
+      lmp->memory->destroy(_host_x);
+      if (q != 0) lmp->memory->destroy(_host_q);
+      if (quat != 0) lmp->memory->destroy(_host_quat);
+    }
+    #endif
+
+    lmp->memory->destroy(x);
+    if (q != 0) lmp->memory->destroy(q);
+    if (quat != 0) lmp->memory->destroy(quat);
+    lmp->memory->destroy(_f);
+    _buf_size = _buf_local_size = 0;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reallocates the per-atom buffers; previous contents are discarded.
+//   nall        - atom count used to size the position/charge/quaternion buffers
+//   nlocal      - atom count used to size the force buffer when newton_pair
+//                 is off
+//   nthreads    - number of host force-buffer copies (one stride each)
+//   offload_end - when > 0 (offload builds only) matching coprocessor
+//                 buffers are allocated as well
+// Sizes are the requested count times 1.1 plus one.
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow(const int nall, const int nlocal, 
+				       const int nthreads,
+				       const int offload_end)
+{
+  free_buffers();
+  _buf_size = static_cast<double>(nall) * 1.1 + 1;
+  if (lmp->force->newton_pair)
+    _buf_local_size = _buf_size;
+  else
+    _buf_local_size = static_cast<double>(nlocal) * 1.1 + 1;
+  // NOTE(review): get_stride() also doubles the length when atom->torque is
+  // set, so the torque factor appears to be applied twice - confirm intended.
+  if (lmp->atom->torque)
+    _buf_local_size *= 2;
+  const int f_stride = get_stride(_buf_local_size);
+  lmp->memory->create(_x, _buf_size,"intel_x");
+  if (lmp->atom->q != NULL)
+    lmp->memory->create(_q, _buf_size, "intel_q");
+  if (lmp->atom->ellipsoid != NULL)
+    lmp->memory->create(_quat, _buf_size, "intel_quat");
+  lmp->memory->create(_f, f_stride * nthreads, "intel_f");
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  // host-side mirrors used by get_x(0)/get_q(0)/get_quat(0)
+  if (_separate_buffers) {
+    lmp->memory->create(_host_x, _buf_size,"intel_host_x");
+    if (lmp->atom->q != NULL)
+      lmp->memory->create(_host_q, _buf_size, "intel_host_q");
+    if (lmp->atom->ellipsoid != NULL)
+      lmp->memory->create(_host_quat, _buf_size, "intel_host_quat");
+  }
+    
+  if (offload_end > 0) {
+    lmp->memory->create(_off_f, f_stride * _off_threads, "intel_off_f");
+    const atom_t * const x = get_x();
+    const flt_t * const q = get_q();
+    const vec3_acc_t * f_start = get_off_f();
+    acc_t * ev_global = get_ev_global();
+    // allocate (without copying) the coprocessor counterparts; the charge
+    // buffer is included only when the atom style has charges
+    if (lmp->atom->q != NULL) {
+      if (x != NULL && q != NULL && f_start != NULL && ev_global != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(x,q:length(_buf_size) alloc_if(1) free_if(0)) \
+	  nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
+	  nocopy(ev_global:length(8) alloc_if(1) free_if(0))
+      }
+    } else {
+      if (x != NULL && f_start != NULL && ev_global != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(x:length(_buf_size) alloc_if(1) free_if(0)) \
+          nocopy(f_start:length(f_stride*_off_threads) alloc_if(1) free_if(0))\
+	  nocopy(ev_global:length(8) alloc_if(1) free_if(0))
+      }
+    }
+    if (lmp->atom->ellipsoid != NULL) {
+      const quat_t * const quat = get_quat();
+      if (quat != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(quat:length(_buf_size) alloc_if(1) free_if(0))
+      }
+    }
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Offload builds only: releases the coprocessor copies of the tag, special,
+// nspecial and bins arrays recorded by _grow_nmax() and resets _off_map_nmax.
+// The host arrays themselves are not freed here. No-op in non-offload builds.
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_nmax()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_off_map_nmax > 0) {
+    // use the host addresses saved at allocation time, not the current ones
+    const int * tag = _off_map_tag;
+    const int * special = _off_map_special;
+    const int * nspecial = _off_map_nspecial;
+    const int * bins = _off_map_bins;
+    if (tag != 0 && special != 0 && nspecial !=0 && bins != 0) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(tag:alloc_if(0) free_if(1)) \
+	nocopy(special,nspecial:alloc_if(0) free_if(1)) \
+	nocopy(bins:alloc_if(0) free_if(1))
+    }
+    _off_map_nmax = 0;
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Offload builds only: (re)allocates coprocessor storage for the arrays
+// sized by atom->nmax (bins, tag, special, nspecial) and remembers the host
+// addresses so free_nmax() can release them later. Nothing is copied here.
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_nmax()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  free_nmax();
+  int *special, *nspecial;
+  int tag_length, special_length, nspecial_length;
+  int size = lmp->atom->nmax;
+  if (lmp->atom->molecular) {
+    special = lmp->atom->special[0];
+    nspecial = lmp->atom->nspecial[0];
+    special_length = size * lmp->atom->maxspecial;
+    nspecial_length = size * 3;
+    tag_length = size;
+  } else {
+    // non-molecular systems: map one-element placeholders so the pragma
+    // below always receives valid addresses
+    special = &_special_holder;
+    nspecial = &_nspecial_holder;
+    special_length = 1;
+    nspecial_length = 1;
+    tag_length = 1;
+  }
+  int *tag = lmp->atom->tag;
+  int *bins = lmp->neighbor->bins;
+  #pragma offload_transfer target(mic:_cop) \
+    nocopy(bins:length(size) alloc_if(1) free_if(0)) \
+    nocopy(tag:length(tag_length) alloc_if(1) free_if(0)) \
+    nocopy(special:length(special_length) alloc_if(1) free_if(0)) \
+    nocopy(nspecial:length(nspecial_length) alloc_if(1) free_if(0))
+  _off_map_tag = tag;
+  _off_map_special = special;
+  _off_map_nspecial = nspecial;
+  _off_map_nmax = size;
+  _off_map_bins = bins;
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Frees the _cnumneigh buffer allocated by _grow_local() and, in offload
+// builds, the coprocessor copies of ilist/numneigh/cnumneigh recorded there.
+// Resets _off_map_maxlocal to 0. No-op when nothing was allocated.
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_local()
+{
+  if (_off_map_maxlocal > 0) {
+    int * cnumneigh = _cnumneigh;
+    #ifdef _LMP_INTEL_OFFLOAD
+    // _off_map_ilist is non-null only if _grow_local() ran with offload_end > 0
+    if (_off_map_ilist != NULL) {
+      const int * ilist = _off_map_ilist;
+      const int * numneigh = _off_map_numneigh;
+      _off_map_ilist = NULL;
+      if (numneigh != 0 && ilist != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(ilist,numneigh,cnumneigh:alloc_if(0) free_if(1))
+      }
+    }
+    #endif
+    // host buffer is released only after the coprocessor copy is gone
+    lmp->memory->destroy(cnumneigh);
+    _off_map_maxlocal = 0;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reallocates _cnumneigh with list->get_maxlocal() entries.
+//   list        - neighbor list providing the size and, for offload, the
+//                 ilist/numneigh arrays to mirror on the coprocessor
+//   offload_end - when > 0 (offload builds only) coprocessor storage for
+//                 ilist, numneigh and cnumneigh is allocated as well
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_local(NeighList *list, 
+					     const int offload_end)
+{
+  free_local();
+  int size = list->get_maxlocal();
+  lmp->memory->create(_cnumneigh, size, "_cnumneigh");
+  _off_map_maxlocal = size;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload_end > 0) {
+    int * numneigh = list->numneigh;
+    int * ilist = list->ilist;
+    int * cnumneigh = _cnumneigh;
+    if (cnumneigh != 0) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ilist:length(size) alloc_if(1) free_if(0)) \
+	nocopy(numneigh:length(size) alloc_if(1) free_if(0)) \
+	nocopy(cnumneigh:length(size) alloc_if(1) free_if(0))
+    }
+    // remember the host addresses so free_local() can release the copies
+    _off_map_ilist = ilist;
+    _off_map_numneigh = numneigh;
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Offload builds only: releases the coprocessor copy of the binhead array
+// recorded by _grow_binhead() and resets _off_map_maxhead. The host array is
+// not freed here. No-op in non-offload builds.
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_binhead()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_off_map_maxhead > 0) {
+    const int * binhead = _off_map_binhead;
+    if (binhead !=0) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(binhead:alloc_if(0) free_if(1))
+    }
+    _off_map_maxhead = 0;
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Offload builds only: (re)allocates coprocessor storage for
+// neighbor->binhead (neighbor->maxhead entries) and records the host address
+// and size for free_binhead(). Nothing is copied here.
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_binhead()
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  free_binhead();
+  int * binhead = lmp->neighbor->binhead;
+  const int maxhead = lmp->neighbor->maxhead;
+  #pragma offload_transfer target(mic:_cop) \
+    nocopy(binhead:length(maxhead) alloc_if(1) free_if(0))
+  _off_map_binhead = binhead;
+  _off_map_maxhead = maxhead;
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Frees the neighbor list storage allocated by _grow_nbor_list() and resets
+// _list_alloc_atoms to 0. No-op when nothing was allocated.
+// In offload builds the coprocessor copies (list storage, special flags,
+// stencil) are released first: the offload pragma identifies them by host
+// address, so _list_alloc must still be valid and non-null at that point.
+// Destroying the host buffer beforehand could skip the device free.
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::free_nbor_list()
+{
+  if (_list_alloc_atoms > 0) {
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_off_list_alloc) {
+      int * list_alloc = _list_alloc;
+      int * special_flag = lmp->neighbor->special_flag_alloc();
+      int * stencil = _off_map_stencil;
+      if (list_alloc != 0 && special_flag != 0 && stencil != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(special_flag,stencil:alloc_if(0) free_if(1)) \
+          nocopy(list_alloc:alloc_if(0) free_if(1))
+      }
+      _off_list_alloc = false;
+    }
+    #endif
+
+    // host storage goes last, after its coprocessor mirror is released
+    lmp->memory->destroy(_list_alloc);
+    _list_alloc_atoms = 0;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reallocates the flat neighbor list storage _list_alloc.
+//   list        - neighbor list whose stencil is mirrored for offload
+//   nlocal      - local atom count; storage is sized for 1.10 * nlocal atoms
+//                 plus one extra row per offload thread, get_max_nbors()
+//                 entries each
+//   offload_end - when > 0 (offload builds only) the list storage is also
+//                 allocated on the coprocessor and the special flags and
+//                 stencil are copied there
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_nbor_list(NeighList *list, 
+						 const int nlocal,
+						 const int offload_end)
+{
+  free_nbor_list();
+  _list_alloc_atoms = 1.10 * nlocal;
+  int list_alloc_size = (_list_alloc_atoms + _off_threads) * get_max_nbors();
+  lmp->memory->create(_list_alloc, list_alloc_size, "_list_alloc");
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (offload_end > 0) {
+    int * list_alloc =_list_alloc;
+    int * special_flag = lmp->neighbor->special_flag;
+    int * stencil = list->stencil;
+
+    if (special_flag != NULL && list_alloc != NULL) {
+      #pragma offload_transfer target(mic:_cop) \
+        in(special_flag:length(4) alloc_if(1) free_if(0)) \
+	in(stencil:length(list->maxstencil) alloc_if(1) free_if(0)) \
+	nocopy(list_alloc:length(list_alloc_size) alloc_if(1) free_if(0))
+      // remembered so free_nbor_list()/_grow_stencil() can release them
+      _off_map_stencil = stencil;
+      _off_list_alloc = true;
+    }
+  }
+  #endif
+}
+
+// Offload builds only: replaces the coprocessor copy of the stencil. The copy
+// keyed by the previously recorded host address is freed, then
+// list->stencil (list->maxstencil entries) is allocated and copied in, and
+// its address is recorded in _off_map_stencil.
+// NOTE(review): the free is unconditional - presumably callers guarantee a
+// stencil was mapped before (see grow_nbor_list); confirm.
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::_grow_stencil(NeighList *list)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  int * stencil = _off_map_stencil;
+  #pragma offload_transfer target(mic:_cop) \
+    nocopy(stencil:alloc_if(0) free_if(1))
+  stencil = list->stencil;
+  #pragma offload_transfer target(mic:_cop) \
+    in(stencil:length(list->maxstencil) alloc_if(1) free_if(0))
+  _off_map_stencil = stencil;
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Resizes the ntypes x ntypes _cutneighsq table. Does nothing when the size
+// is unchanged. Any existing table is freed (host and, in offload builds,
+// coprocessor); a new one is allocated when ntypes > 0. Passing 0 therefore
+// just releases the table. Contents are not initialized here.
+template <class flt_t, class acc_t>
+void IntelBuffers<flt_t, acc_t>::set_ntypes(const int ntypes)
+{
+  if (ntypes != _ntypes) {
+    if (_ntypes > 0) {
+      #ifdef _LMP_INTEL_OFFLOAD
+      // the coprocessor copy is keyed by the address of the first element
+      flt_t * cutneighsqo = _cutneighsq[0];
+      if (cutneighsqo != 0) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(cutneighsqo:alloc_if(0) free_if(1))
+      }
+      #endif
+      lmp->memory->destroy(_cutneighsq);
+    }
+    if (ntypes > 0) {
+      lmp->memory->create(_cutneighsq, ntypes, ntypes, "_cutneighsq");
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * cutneighsqo = _cutneighsq[0];
+      if (cutneighsqo != NULL) {
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(cutneighsqo:length(ntypes * ntypes) alloc_if(1) free_if(0))
+      }
+      #endif
+    }
+    _ntypes = ntypes;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Estimates the host memory, in bytes, held by this buffer set.
+//   nthreads - number of host force-buffer copies (as passed to grow())
+// Returns the sum of the per-atom buffers, the host and offload force
+// buffers, the cnumneigh array, the neighbor list storage and the cutoff
+// table.
+// NOTE(review): quaternion storage is counted when atom->torque is set, but
+// _grow() allocates it when atom->ellipsoid is set; the cutoff table is
+// counted with sizeof(int) although it stores flt_t. Left unchanged here -
+// confirm whether the estimate should follow the allocations exactly.
+template <class flt_t, class acc_t>
+double IntelBuffers<flt_t, acc_t>::memory_usage(const int nthreads)
+{
+  // bytes per atom slot in the position/charge/quaternion buffers
+  double tmem = sizeof(atom_t);
+  if (lmp->atom->q) tmem += sizeof(flt_t);
+  if (lmp->atom->torque) tmem += sizeof(quat_t);
+  #ifdef _LMP_INTEL_OFFLOAD
+  // separate host mirrors double the per-atom footprint
+  if (_separate_buffers) tmem *= 2;
+  #endif
+  tmem *= _buf_size;
+
+  const int fstride = get_stride(_buf_local_size);
+  tmem += fstride * nthreads * sizeof(vec3_acc_t);
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_off_f) tmem += fstride*_off_threads * sizeof(vec3_acc_t);
+  #endif
+
+  tmem += _off_map_maxlocal * sizeof(int);
+  tmem += (_list_alloc_atoms + _off_threads) * get_max_nbors() * sizeof(int);
+  tmem += _ntypes * _ntypes * sizeof(int);
+
+  // the total was computed but never returned; flowing off the end of a
+  // non-void function is undefined behavior
+  return tmem;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Explicit instantiations for the three <flt_t, acc_t> combinations built
+// into the library; the member definitions above live in this .cpp file, so
+// other combinations would not link.
+template class IntelBuffers<float,float>;
+template class IntelBuffers<float,double>;
+template class IntelBuffers<double,double>;
diff --git a/src/USER-INTEL/intel_buffers.h b/src/USER-INTEL/intel_buffers.h
new file mode 100644
index 0000000000..bc1ca9e3b8
--- /dev/null
+++ b/src/USER-INTEL/intel_buffers.h
@@ -0,0 +1,284 @@
+/* -*- c++ -*- -------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifndef LMP_INTEL_BUFFERS_H
+#define LMP_INTEL_BUFFERS_H
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+#include "atom.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "intel_preprocess.h"
+#include <cstring>
+
+namespace LAMMPS_NS {
+
+#define ATOM_T typename IntelBuffers<flt_t,acc_t>::atom_t
+#define QUAT_T typename IntelBuffers<flt_t,acc_t>::quat_t
+#define FORCE_T typename IntelBuffers<flt_t,acc_t>::vec3_acc_t
+
+// Owns the packed per-atom and neighbor-list buffers used by the Intel
+// package styles, templated on the storage precision (flt_t) and the
+// accumulation precision (acc_t). In offload builds it also tracks which
+// host arrays currently have a coprocessor copy (the _off_map_* members).
+// May not need a separate force array for mixed/double
+template <class flt_t, class acc_t>
+class IntelBuffers {
+ public:
+  // packed position; w carries the atom type (see thr_pack)
+  typedef struct { flt_t x,y,z; int w; } atom_t;
+  typedef struct { flt_t w,i,j,k; } quat_t;
+  typedef struct { flt_t x,y,z,w; } vec3_t;  
+  typedef struct { flt_t x,y,z,w; } vec4_t;
+  // force/accumulator element in accumulation precision
+  typedef struct { acc_t x,y,z,w; } vec3_acc_t;
+    
+  IntelBuffers(class LAMMPS *lmp_in);
+  ~IntelBuffers();
+
+  // Per-thread force buffer stride for nall atoms, as computed by the
+  // IP_PRE_get_stride macro from the element size and the torque flag.
+  inline int get_stride(int nall) {
+    int stride;
+    IP_PRE_get_stride(stride, nall, sizeof(vec3_acc_t), 
+			 lmp->atom->torque);
+    return stride;
+  }
+
+  void free_buffers();
+
+  // Reallocate the per-atom buffers only when either count has reached the
+  // current capacity (contents are not preserved by _grow).
+  inline void grow(const int nall, const int nlocal, const int nthreads,
+                   const int offload_end) {
+    if (nall >= _buf_size || nlocal >= _buf_local_size)
+      _grow(nall, nlocal, nthreads, offload_end);
+  }
+
+  // Release every neighbor-related buffer.
+  inline void free_all_nbor_buffers() {
+    free_nbor_list();
+    free_nmax();
+    free_binhead();
+    free_local();
+  }
+
+  // Grow all neighbor-related buffers as needed; the nmax/binhead mirrors
+  // are only touched when offload_end is non-zero.
+  inline void grow_nbor(NeighList *list, const int nlocal,
+                        const int offload_end) {
+    grow_local(list, offload_end);
+    if (offload_end) {
+      grow_nmax();
+      grow_binhead();
+    }
+    grow_nbor_list(list, nlocal, offload_end);
+  }
+
+  void free_nmax();
+
+  // Offload builds: re-map when atom->nmax exceeds the mapped size.
+  inline void grow_nmax() {
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (lmp->atom->nmax > _off_map_nmax)
+      _grow_nmax();
+    #endif
+  }
+
+  void free_local();
+
+  // Reallocate cnumneigh when the list's maxlocal exceeds its current size.
+  inline void grow_local(NeighList *list, const int offload_end) {
+    if (list->get_maxlocal() > _off_map_maxlocal)
+      _grow_local(list, offload_end);
+  }
+
+  void free_binhead();
+  
+  // Offload builds: re-map when neighbor->maxhead exceeds the mapped size.
+  inline void grow_binhead() {
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (lmp->neighbor->maxhead > _off_map_maxhead)
+      _grow_binhead();
+    #endif
+  }
+
+  // Neighbor slots reserved per atom: neighbor->oneatom divided by
+  // INTEL_ONEATOM_FACTOR, rounded down to a whole number of
+  // INTEL_DATA_ALIGN-byte chunks of ints.
+  inline int get_max_nbors() {
+    int mn = lmp->neighbor->oneatom * sizeof(int) /
+        (INTEL_ONEATOM_FACTOR * INTEL_DATA_ALIGN);
+    return mn * INTEL_DATA_ALIGN / sizeof(int);
+  }
+  
+  void free_nbor_list();
+
+  // Reallocate list storage when nlocal exceeds the allocated atom count;
+  // otherwise (offload builds) refresh the coprocessor stencil if the list's
+  // stencil pointer changed.
+  inline void grow_nbor_list(NeighList *list, const int nlocal,
+                             const int offload_end) {
+    if (nlocal > _list_alloc_atoms)
+      _grow_nbor_list(list, nlocal, offload_end);
+    #ifdef _LMP_INTEL_OFFLOAD
+    else if (offload_end > 0 && _off_map_stencil != list->stencil)
+      _grow_stencil(list);
+    #endif
+  }
+
+  void set_ntypes(const int ntypes);
+
+  // Accessors; the list argument is unused in both.
+  inline int * firstneigh(const NeighList *list) { return _list_alloc; }
+  inline int * cnumneigh(const NeighList *list) { return _cnumneigh; }
+
+  // Buffer accessors. With offload == 0 in separate-buffer offload builds
+  // the host mirror is returned; in every other case the primary buffer.
+  inline atom_t * get_x(const int offload = 1) { 
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_separate_buffers && offload == 0) return _host_x;
+    #endif
+    return _x; 
+  }
+  inline flt_t * get_q(const int offload = 1) { 
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_separate_buffers && offload == 0) return _host_q;
+    #endif
+    return _q; 
+  }
+  inline quat_t * get_quat(const int offload = 1) { 
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (_separate_buffers && offload == 0) return _host_quat;
+    #endif
+    return _quat; 
+  }
+  inline vec3_acc_t * get_f() { return _f; }
+  inline acc_t * get_ev_global() { return _ev_global; }
+  inline acc_t * get_ev_global_host() { return _ev_global_host; }
+  // Clear both 8-element global accumulator arrays.
+  inline void zero_ev() 
+    { for (int i = 0; i < 8; i++) _ev_global[i] = _ev_global_host[i] = 0.0; }
+  inline flt_t ** get_cutneighsq() { return _cutneighsq; }
+  inline int get_off_threads() { return _off_threads; }
+  #ifdef _LMP_INTEL_OFFLOAD
+  // Set the offload thread count, target coprocessor and buffer mode.
+  inline void set_off_params(const int n, const int cop, 
+			     const int separate_buffers) 
+    { _off_threads = n; _cop = cop; _separate_buffers = separate_buffers; } 
+  inline vec3_acc_t * get_off_f() { return _off_f; }
+  #endif
+
+  // Copy atoms [ifrom, ito) from the Atom arrays into the packed buffers.
+  // With ago == 0 the types and (if present) charges are copied too;
+  // otherwise only the coordinates are refreshed.
+  inline void thr_pack(const int ifrom, const int ito, const int ago) {
+    if (ago == 0) {
+      for (int i = ifrom; i < ito; i++) {
+        _x[i].x = lmp->atom->x[i][0];
+        _x[i].y = lmp->atom->x[i][1];
+        _x[i].z = lmp->atom->x[i][2];
+        _x[i].w = lmp->atom->type[i];
+      }
+      if (lmp->atom->q != NULL)
+        for (int i = ifrom; i < ito; i++)
+          _q[i] = lmp->atom->q[i];
+    } else {
+      for (int i = ifrom; i < ito; i++) {
+        _x[i].x = lmp->atom->x[i][0];
+        _x[i].y = lmp->atom->x[i][1];
+        _x[i].z = lmp->atom->x[i][2];
+      }
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  // Pack into _x, reading source atom i + offset for destination slot i;
+  // types are copied only when dotype is true.
+  inline void thr_pack_cop(const int ifrom, const int ito, 
+			   const int offset, const bool dotype = false) {
+    double ** x = lmp->atom->x + offset;
+    if (dotype == false) {
+      #pragma vector nontemporal
+      for (int i = ifrom; i < ito; i++) {
+        _x[i].x = x[i][0];
+        _x[i].y = x[i][1];
+        _x[i].z = x[i][2];
+      }
+    } else {
+      int *type = lmp->atom->type + offset;
+      #pragma vector nontemporal
+      for (int i = ifrom; i < ito; i++) {
+	_x[i].x = x[i][0];
+	_x[i].y = x[i][1];
+	_x[i].z = x[i][2];
+	_x[i].w = type[i];
+      }
+    }
+  }
+
+  // Same coordinate packing as thr_pack_cop, but into the host mirror.
+  inline void thr_pack_host(const int ifrom, const int ito, 
+			    const int offset) {
+    double ** x = lmp->atom->x + offset;
+    for (int i = ifrom; i < ito; i++) {
+      _host_x[i].x = x[i][0];
+      _host_x[i].y = x[i][1];
+      _host_x[i].z = x[i][2];
+    }
+  }
+
+  // Copy the local and ghost ranges used by the host from the primary
+  // buffers into the host mirrors, placing the ghosts directly after the
+  // locals, and write an extra atom at INTEL_BIGP with type 1 after them.
+  inline void pack_sep_from_single(const int host_min_local, 
+				   const int used_local,
+				   const int host_min_ghost,
+				   const int used_ghost) {
+    memcpy(_host_x + host_min_local, _x + host_min_local,
+	   used_local * sizeof(atom_t));
+    memcpy(_host_x + host_min_local + used_local, _x + host_min_ghost,
+	   used_ghost * sizeof(atom_t));
+    int nall = used_local + used_ghost + host_min_local;
+    _host_x[nall].x = INTEL_BIGP;
+    _host_x[nall].y = INTEL_BIGP;
+    _host_x[nall].z = INTEL_BIGP;
+    _host_x[nall].w = 1;
+    if (lmp->atom->q != NULL) {
+      memcpy(_host_q + host_min_local, _q + host_min_local,
+	     used_local * sizeof(flt_t));
+      memcpy(_host_q + host_min_local + used_local, _q + host_min_ghost,
+	     used_ghost * sizeof(flt_t));
+    }
+  }
+  #endif
+
+  double memory_usage(const int nthreads);
+
+  // one-element placeholders mapped by _grow_nmax() for non-molecular systems
+  int _special_holder, _nspecial_holder;
+
+ protected:
+  LAMMPS *lmp;
+  // primary packed per-atom buffers
+  atom_t *_x;
+  flt_t *_q;
+  quat_t *_quat;
+  vec3_acc_t * _f;
+  int _off_threads, _off_map_maxlocal;
+
+  // neighbor list storage and its capacity in atoms
+  int _list_alloc_atoms;
+  int * _list_alloc;
+  int * _cnumneigh;
+
+  // _ntypes x _ntypes table managed by set_ntypes()
+  flt_t **_cutneighsq;
+  int _ntypes;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _separate_buffers;
+  // host mirrors, allocated only when _separate_buffers is set
+  atom_t *_host_x;
+  flt_t *_host_q;
+  quat_t *_host_quat;
+  vec3_acc_t *_off_f;
+  // host addresses/sizes of arrays that currently have a coprocessor copy
+  int _off_map_nmax, _off_map_maxhead, _cop;
+  int *_off_map_ilist;
+  int *_off_map_stencil, *_off_map_special, *_off_map_nspecial, *_off_map_tag;
+  int *_off_map_binhead, *_off_map_bins, *_off_map_numneigh;
+  bool _off_list_alloc;
+  #endif
+  
+  // capacities (in atoms) of the per-atom and force buffers
+  int _buf_size, _buf_local_size;
+  __declspec(align(64)) acc_t _ev_global[8];
+  __declspec(align(64)) acc_t _ev_global_host[8];
+
+  void _grow(const int nall, const int nlocal, const int nthreads,
+	     const int offload_end);
+  void _grow_nmax();
+  void _grow_local(NeighList *list, const int offload_end);
+  void _grow_binhead();
+  void _grow_nbor_list(NeighList *list, const int nlocal,
+                       const int offload_end);
+  void _grow_stencil(NeighList *list);
+};
+
+}
+
+#endif
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
new file mode 100644
index 0000000000..49e3413e0a
--- /dev/null
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -0,0 +1,391 @@
+/* -*- c++ -*- -------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef __INTEL_OFFLOAD
+#ifdef LMP_INTEL_OFFLOAD
+#define _LMP_INTEL_OFFLOAD
+#endif
+#endif
+
+#ifndef LMP_INTEL_PREPROCESS_H
+#define LMP_INTEL_PREPROCESS_H
+
+#ifndef LAMMPS_MEMALIGN
+#error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS for your LAMMPS makefile.
+#endif
+
+namespace LAMMPS_NS {
+
+enum {LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX, LMP_GHOST_MIN,
+      LMP_GHOST_MAX};
+enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
+      TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
+      TIME_IMBALANCE};
+#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
+
+#define INTEL_DATA_ALIGN 64
+#define INTEL_ONEATOM_FACTOR 2
+#define INTEL_MIC_VECTOR_WIDTH 16
+#define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH
+#define INTEL_VECTOR_WIDTH 8
+#define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
+#define INTEL_LB_MEAN_WEIGHT 0.1
+#define INTEL_BIGP 1e15
+
+#define IP_PRE_get_stride(stride, n, datasize, torque)	\
+  {								\
+    int blength = n;						\
+    if (torque) blength *= 2;					\
+    const int bytes = blength * datasize;			\
+    stride = INTEL_DATA_ALIGN - (bytes % INTEL_DATA_ALIGN);     \
+    stride = blength + stride / datasize;			\
+  }
+
+#if defined(_OPENMP)
+
+#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)          \
+  { /* thread tid gets [ifrom,ito) of [0,inum); may be empty */    \
+    const int idelta = 1 + (inum) / (nthreads);                    \
+    ifrom = (tid) * idelta;                                        \
+    ito = ((ifrom + idelta) > (inum)) ? (inum) : ifrom + idelta;   \
+  }
+
+#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)	\
+  {								\
+    tid = omp_get_thread_num();         			\
+    IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads);		\
+  }
+
+#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
+                             datasize)                          \
+{ /* per-thread [ifrom,ito); chunk is a multiple of chunk_size */ \
+  int chunk_size = INTEL_DATA_ALIGN / (datasize);               \
+  int idelta = static_cast<int>(static_cast<float>(inum)        \
+                                /chunk_size/(nthreads)) + 1;    \
+  idelta *= chunk_size;                                         \
+  ifrom = (tid)*idelta;                                         \
+  ito = ifrom + idelta;                                         \
+  if (ito > (inum)) ito = (inum);                               \
+}
+
+#define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum,        \
+				nthreads, datasize)		\
+  {								\
+    tid = omp_get_thread_num();         			\
+    IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads,     \
+			   datasize);				\
+  }
+
+#else
+
+#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)	\
+  {								\
+    ifrom = 0;							\
+    ito = inum;						        \
+  }
+
+#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)	\
+  {								\
+    tid = 0;							\
+    ifrom = 0;							\
+    ito = inum;							\
+  }
+
+#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
+                             datasize)                          \
+{                                                               \
+    ifrom = 0;							\
+    ito = inum;						        \
+}
+
+#define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum,        \
+				nthreads, datasize)		\
+{								\
+  tid = 0;							\
+  ifrom = 0;							\
+  ito = inum;							\
+}
+
+#endif
+
+#ifdef _LMP_INTEL_OFFLOAD
+#include <sys/time.h>
+
+__declspec( target (mic))
+inline double MIC_Wtime() {
+  double time;
+  struct timeval tv;
+
+  gettimeofday(&tv, NULL);
+  time = 1.0 * tv.tv_sec + 1.0e-6 * tv.tv_usec;
+  return time;
+}
+
+#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,	\
+				     nlocal, nall)			\
+{									\
+    if (fix->separate_buffers() && ago != 0) {				\
+    fix->start_watch(TIME_PACK);					\
+    if (offload) {							\
+      _Pragma("omp parallel default(none) shared(buffers,nlocal,nall)")	\
+      {									\
+        int ifrom, ito, tid;						\
+	int nthreads = comm->nthreads;					\
+	IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,		\
+				nthreads, sizeof(flt_t));		\
+	buffers->thr_pack_cop(ifrom, ito, 0);				\
+	int nghost = nall - nlocal;					\
+	if (nghost) {							\
+	  IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,	\
+				 nthreads, sizeof(flt_t));		\
+	  buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal,		\
+				fix->offload_min_ghost() - nlocal,	\
+				ago == 1);				\
+	}								\
+      }									\
+    } else {								\
+      buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);		\
+      buffers->thr_pack_host(nlocal, nall,				\
+			     fix->host_min_ghost()-nlocal);		\
+    }									\
+    fix->stop_watch(TIME_PACK);						\
+  }									\
+}
+
+#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, 	\
+			     buffers, offload, fix, separate_flag,	\
+			     x_size, q_size, ev_size, f_stride)		\
+{									\
+  separate_flag = 0;							\
+  if (ago == 0) {							\
+    x_size = 0;								\
+    q_size = nall;							\
+    if (offload) {							\
+      if (fix->separate_buffers()) {					\
+	if (lmp->atom->torque)						\
+	  separate_flag = 2;						\
+	else								\
+	  separate_flag = 1;						\
+      } else								\
+	separate_flag = 3;						\
+    }									\
+  } else {								\
+    x_size = nall;							\
+    q_size = 0;								\
+  }									\
+  ev_size = 0;								\
+  if (evflag) {								\
+    if (eflag) ev_size = 2;						\
+    if (vflag) ev_size = 8;						\
+  }									\
+  int f_length;								\
+  if (newton)								\
+    f_length = nall;							\
+  else									\
+    f_length = nlocal;							\
+  f_length -= minlocal;							\
+  f_stride = buffers->get_stride(f_length);				\
+}
+
+#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,    	\
+			   ev_global)					\
+{									\
+  if (offload) {							\
+    tc = buffers->get_off_threads();					\
+    f_start = buffers->get_off_f();					\
+    ev_global = buffers->get_ev_global();				\
+  } else {								\
+    tc = comm->nthreads;						\
+    f_start = buffers->get_f();						\
+    fix->start_watch(TIME_HOST_PAIR);					\
+    ev_global = buffers->get_ev_global_host();				\
+  }									\
+}
+
+#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,	\
+				  f_stride, x, q)			\
+{									\
+  if (separate_flag) {							\
+    if (separate_flag < 3) {						\
+      int all_local = nlocal;						\
+      int ghost_min = overflow[LMP_GHOST_MIN];				\
+      nlocal = overflow[LMP_LOCAL_MAX] + 1;				\
+      int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;		\
+      if (nghost < 0) nghost = 0;					\
+      nall = nlocal + nghost;						\
+      separate_flag--;							\
+      int flength;							\
+      if (NEWTON_PAIR) flength = nall;					\
+      else flength = nlocal;						\
+      IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),		\
+			   separate_flag);				\
+      if (nghost) {							\
+	if (nlocal < all_local || ghost_min > all_local) {		\
+	  memmove(x + nlocal, x + ghost_min,				\
+		  (nall - nlocal) * sizeof(ATOM_T));			\
+	  if (q != 0)							\
+	    memmove((void *)(q + nlocal), (void *)(q + ghost_min),	\
+		    (nall - nlocal) * sizeof(flt_t));			\
+	}								\
+      }									\
+    }									\
+    x[nall].x = INTEL_BIGP;						\
+    x[nall].y = INTEL_BIGP;						\
+    x[nall].z = INTEL_BIGP;						\
+  }									\
+}
+
+
+#else
+
+#define MIC_Wtime MPI_Wtime
+#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
+                                     nlocal, nall)			
+
+#define IP_PRE_get_transfern(ago, newton, evflag, eflag, vflag, 	\
+			     buffers, offload, fix, separate_flag,	\
+			     x_size, q_size, ev_size, f_stride)		\
+{                                                                       \
+  separate_flag = 0;							\
+  int f_length;                                                         \
+  if (newton)                                                           \
+    f_length = nall;                                                    \
+  else                                                                  \
+    f_length = nlocal;                                                  \
+  f_stride = buffers->get_stride(f_length);				\
+}
+
+#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,    	\
+			   ev_global)					\
+{									\
+  tc = comm->nthreads;							\
+  f_start = buffers->get_f();						\
+  fix->start_watch(TIME_HOST_PAIR);					\
+  ev_global = buffers->get_ev_global_host();				\
+}
+
+#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,	\
+				  f_stride, x, q)			
+
+
+#endif
+
+#define IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz)	\
+{									\
+  if (vflag == 1) {							\
+    sv0 += ev_pre * delx * delx * fpair;				\
+    sv1 += ev_pre * dely * dely * fpair;				\
+    sv2 += ev_pre * delz * delz * fpair;				\
+    sv3 += ev_pre * delx * dely * fpair;				\
+    sv4 += ev_pre * delx * delz * fpair;				\
+    sv5 += ev_pre * dely * delz * fpair;				\
+  }									\
+}
+
+#define IP_PRE_ev_tally_atom(evflag, eflag, vflag, f, fwtmp)    	\
+{									\
+  if (evflag) {								\
+    if (eflag) {							\
+      f[i].w += fwtmp;							\
+      oevdwl += sevdwl;							\
+    }									\
+    if (vflag == 1) {							\
+      ov0 += sv0;							\
+      ov1 += sv1;							\
+      ov2 += sv2;							\
+      ov3 += sv3;							\
+      ov4 += sv4;							\
+      ov5 += sv5;							\
+    }									\
+  }									\
+}
+
+#define IP_PRE_ev_tally_atomq(evflag, eflag, vflag, f, fwtmp)    	\
+{									\
+  if (evflag) {								\
+    if (eflag) {							\
+      f[i].w += fwtmp;							\
+      oevdwl += sevdwl;							\
+      oecoul += secoul;							\
+    }									\
+    if (vflag == 1) {							\
+      ov0 += sv0;							\
+      ov1 += sv1;							\
+      ov2 += sv2;							\
+      ov3 += sv3;							\
+      ov4 += sv4;							\
+      ov5 += sv5;							\
+    }									\
+  }									\
+}
+
+#define IP_PRE_fdotr_acc_force(newton, evflag, eflag, vflag, eatom,	\
+			       nall, nlocal, minlocal, nthreads,	\
+			       f_start, f_stride, x)			\
+{ /* uses offload, iifrom, iito, tid, ov0..ov5 from the call site */	\
+  int o_range;								\
+  if (newton)								\
+    o_range = nall;							\
+  else									\
+    o_range = nlocal;							\
+  if (offload == 0) o_range -= minlocal;				\
+  IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads,		\
+			 sizeof(acc_t));				\
+  /* add the copies of threads 1..nthreads-1 (f_stride apart) to copy 0 */ \
+  int t_off = f_stride;						        \
+  if (eflag && eatom) {							\
+    for (int t = 1; t < nthreads; t++) {				\
+      _Pragma("vector nontemporal")					\
+      for (int n = iifrom; n < iito; n++) {				\
+        f_start[n].x += f_start[n + t_off].x;				\
+        f_start[n].y += f_start[n + t_off].y;				\
+        f_start[n].z += f_start[n + t_off].z;				\
+        f_start[n].w += f_start[n + t_off].w;				\
+      }									\
+      t_off += f_stride;						\
+    }									\
+  } else {								\
+    for (int t = 1; t < nthreads; t++) {				\
+      _Pragma("vector nontemporal")   					\
+      for (int n = iifrom; n < iito; n++) {				\
+        f_start[n].x += f_start[n + t_off].x;				\
+        f_start[n].y += f_start[n + t_off].y;				\
+        f_start[n].z += f_start[n + t_off].z;				\
+      }									\
+      t_off += f_stride;						\
+    }									\
+  }									\
+  /* vflag == 2: accumulate ov0..ov5 from summed forces and positions */ \
+  if (evflag) {								\
+    if (vflag == 2) {							\
+      const ATOM_T * restrict const xo = x + minlocal;			\
+      _Pragma("vector nontemporal")   					\
+      for (int n = iifrom; n < iito; n++) {				\
+	ov0 += f_start[n].x * xo[n].x;					\
+	ov1 += f_start[n].y * xo[n].y;					\
+	ov2 += f_start[n].z * xo[n].z;					\
+	ov3 += f_start[n].y * xo[n].x;					\
+	ov4 += f_start[n].z * xo[n].x;					\
+	ov5 += f_start[n].z * xo[n].y;					\
+      }									\
+    }									\
+  }									\
+}
+
+}
+
+#endif
diff --git a/src/USER-INTEL/math_extra_intel.h b/src/USER-INTEL/math_extra_intel.h
new file mode 100644
index 0000000000..62163b3f60
--- /dev/null
+++ b/src/USER-INTEL/math_extra_intel.h
@@ -0,0 +1,354 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifndef LMP_MATH_EXTRA_INTEL_H
+#define LMP_MATH_EXTRA_INTEL_H
+
+#define ME_quat_to_mat_trans(quat, mat)		\
+{						\
+  flt_t quat_w = quat.w;			\
+  flt_t quat_i = quat.i;			\
+  flt_t quat_j = quat.j;			\
+  flt_t quat_k = quat.k;			\
+  flt_t w2 = quat_w * quat_w;			\
+  flt_t i2 = quat_i * quat_i;			\
+  flt_t j2 = quat_j * quat_j;			\
+  flt_t k2 = quat_k * quat_k;			\
+  flt_t twoij = (flt_t)2.0 * quat_i * quat_j;	\
+  flt_t twoik = (flt_t)2.0 * quat_i * quat_k;	\
+  flt_t twojk = (flt_t)2.0 * quat_j * quat_k;	\
+  flt_t twoiw = (flt_t)2.0 * quat_i * quat_w;	\
+  flt_t twojw = (flt_t)2.0 * quat_j * quat_w;	\
+  flt_t twokw = (flt_t)2.0 * quat_k * quat_w;	\
+  						\
+  mat##_0 = w2 + i2 - j2 - k2;			\
+  mat##_3 = twoij - twokw;			\
+  mat##_6 = twojw + twoik;			\
+  						\
+  mat##_1 = twoij + twokw;			\
+  mat##_4 = w2 - i2 + j2 - k2;			\
+  mat##_7 = twojk - twoiw;			\
+  						\
+  mat##_2 = twoik - twojw;			\
+  mat##_5 = twojk + twoiw;			\
+  mat##_8 = w2 - i2 - j2 + k2;			\
+}
+
+/* ----------------------------------------------------------------------
+   diagonal matrix times a full matrix
+------------------------------------------------------------------------- */
+
+#define ME_diag_times3(d, m, ans)			\
+  {							\
+  ans##_0 = d[0] * m##_0;				\
+  ans##_1 = d[0] * m##_1;				\
+  ans##_2 = d[0] * m##_2;				\
+  ans##_3 = d[1] * m##_3;				\
+  ans##_4 = d[1] * m##_4;				\
+  ans##_5 = d[1] * m##_5;				\
+  ans##_6 = d[2] * m##_6;				\
+  ans##_7 = d[2] * m##_7;				\
+  ans##_8 = d[2] * m##_8;				\
+}
+
+#define ME_diag_times3a(d, m, ans)			\
+  {							\
+  ans##_0 = d##_0 * m##_0;				\
+  ans##_1 = d##_0 * m##_1;				\
+  ans##_2 = d##_0 * m##_2;				\
+  ans##_3 = d##_1 * m##_3;				\
+  ans##_4 = d##_1 * m##_4;				\
+  ans##_5 = d##_1 * m##_5;				\
+  ans##_6 = d##_2 * m##_6;				\
+  ans##_7 = d##_2 * m##_7;				\
+  ans##_8 = d##_2 * m##_8;				\
+}
+
+/* ----------------------------------------------------------------------
+   multiply the transpose of mat1 times mat2
+------------------------------------------------------------------------- */
+
+#define ME_transpose_times3(m1, m2, ans)                	\
+{								\
+  ans##_0 = m1##_0*m2##_0 + m1##_3*m2##_3 + m1##_6*m2##_6;	\
+  ans##_1 = m1##_0*m2##_1 + m1##_3*m2##_4 + m1##_6*m2##_7;	\
+  ans##_2 = m1##_0*m2##_2 + m1##_3*m2##_5 + m1##_6*m2##_8;	\
+  ans##_3 = m1##_1*m2##_0 + m1##_4*m2##_3 + m1##_7*m2##_6;	\
+  ans##_4 = m1##_1*m2##_1 + m1##_4*m2##_4 + m1##_7*m2##_7;	\
+  ans##_5 = m1##_1*m2##_2 + m1##_4*m2##_5 + m1##_7*m2##_8;	\
+  ans##_6 = m1##_2*m2##_0 + m1##_5*m2##_3 + m1##_8*m2##_6;	\
+  ans##_7 = m1##_2*m2##_1 + m1##_5*m2##_4 + m1##_8*m2##_7;	\
+  ans##_8 = m1##_2*m2##_2 + m1##_5*m2##_5 + m1##_8*m2##_8;	\
+}
+
+/* ----------------------------------------------------------------------
+   normalize a vector, return in ans
+------------------------------------------------------------------------- */
+
+#define ME_normalize3(v0, v1, v2, ans)	        	\
+{							\
+  flt_t scale = (flt_t)1.0 / sqrt(v0*v0+v1*v1+v2*v2);	\
+  ans##_0 = v0 * scale;					\
+  ans##_1 = v1 * scale;					\
+  ans##_2 = v2 * scale;					\
+}
+
+/* ----------------------------------------------------------------------
+   add two matrices
+------------------------------------------------------------------------- */
+
+#define ME_plus3(m1, m2, ans)			\
+{						\
+  ans##_0 = m1##_0 + m2##_0;			\
+  ans##_1 = m1##_1 + m2##_1;			\
+  ans##_2 = m1##_2 + m2##_2;			\
+  ans##_3 = m1##_3 + m2##_3;			\
+  ans##_4 = m1##_4 + m2##_4;			\
+  ans##_5 = m1##_5 + m2##_5;			\
+  ans##_6 = m1##_6 + m2##_6;			\
+  ans##_7 = m1##_7 + m2##_7;			\
+  ans##_8 = m1##_8 + m2##_8;			\
+}
+
+/* ----------------------------------------------------------------------
+   dot product of 2 vectors
+------------------------------------------------------------------------- */
+
+#define ME_dot3(v1, v2) \
+  (v1##_0*v2##_0 + v1##_1 * v2##_1 + v1##_2 * v2##_2)
+
+/* ----------------------------------------------------------------------
+   determinant of a matrix
+------------------------------------------------------------------------- */
+
+#define ME_det3(m)				    \
+  ( m##_0 * m##_4 * m##_8 - m##_0 * m##_5 * m##_7 - \
+    m##_3 * m##_1 * m##_8 + m##_3 * m##_2 * m##_7 + \
+    m##_6 * m##_1 * m##_5 - m##_6 * m##_2 * m##_4 )
+
+/* ----------------------------------------------------------------------
+   row vector times matrix
+------------------------------------------------------------------------- */
+
+#define ME_vecmat(v, m, ans)				    \
+{							    \
+  ans##_0 = v##_0 * m##_0 + v##_1 * m##_3 + v##_2 * m##_6;  \
+  ans##_1 = v##_0 * m##_1 + v##_1 * m##_4 + v##_2 * m##_7;  \
+  ans##_2 = v##_0 * m##_2 + v##_1 * m##_5 + v##_2 * m##_8;  \
+}
+
+/* ----------------------------------------------------------------------
+   cross product of 2 vectors
+------------------------------------------------------------------------- */
+
+#define ME_cross3(v1, v2, ans)			\
+{						\
+  ans##_0 = v1##_1 * v2##_2 - v1##_2 * v2##_1;	\
+  ans##_1 = v1##_2 * v2##_0 - v1##_0 * v2##_2;	\
+  ans##_2 = v1##_0 * v2##_1 - v1##_1 * v2##_0;	\
+}
+
+/* ----------------------------------------------------------------------
+   cross product of one matrix row (mv0: row 0, mv1: row 1, mv2: row 2) and a vector
+------------------------------------------------------------------------- */
+
+#define ME_mv0_cross3(m1, v2, ans)		\
+{						\
+  ans##_0 = m1##_1 * v2##_2 - m1##_2 * v2##_1;	\
+  ans##_1 = m1##_2 * v2##_0 - m1##_0 * v2##_2;	\
+  ans##_2 = m1##_0 * v2##_1 - m1##_1 * v2##_0;	\
+}
+
+#define ME_mv1_cross3(m1, v2, ans)		\
+{						\
+  ans##_0 = m1##_4 * v2##_2 - m1##_5 * v2##_1;	\
+  ans##_1 = m1##_5 * v2##_0 - m1##_3 * v2##_2;	\
+  ans##_2 = m1##_3 * v2##_1 - m1##_4 * v2##_0;	\
+}
+
+#define ME_mv2_cross3(m1, v2, ans)		\
+{						\
+  ans##_0 = m1##_7 * v2##_2 - m1##_8 * v2##_1;	\
+  ans##_1 = m1##_8 * v2##_0 - m1##_6 * v2##_2;	\
+  ans##_2 = m1##_6 * v2##_1 - m1##_7 * v2##_0;	\
+}
+
+
+#define ME_compute_eta_torque(m1, m2, s1, ans)                              \
+{								            \
+  flt_t den = m1##_3*m1##_2*m1##_7-m1##_0*m1##_5*m1##_7-		    \
+    m1##_2*m1##_6*m1##_4+m1##_1*m1##_6*m1##_5-				    \
+    m1##_3*m1##_1*m1##_8+m1##_0*m1##_4*m1##_8;				    \
+  den = (flt_t)1.0 / den;						    \
+									    \
+  ans##_0 = s1##_0*(m1##_5*m1##_1*m2##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_0-   \
+ 		   m1##_4*m2##_2*m1##_2-(flt_t)2.0*m1##_5*m2##_0*m1##_7+    \
+		   m2##_1*m1##_2*m1##_7-m2##_1*m1##_1*m1##_8-	 	    \
+		   m1##_3*m1##_8*m2##_1+m1##_6*m1##_5*m2##_1+		    \
+		   m1##_3*m2##_2*m1##_7-m2##_2*m1##_6*m1##_4)*den;	    \
+									    \
+  ans##_1 = s1##_0*(m1##_2*m2##_0*m1##_7-m1##_8*m2##_0*m1##_1+		    \
+		   (flt_t)2.0*m1##_0*m1##_8*m2##_1-m1##_0*m2##_2*m1##_5-    \
+		   (flt_t)2.0*m1##_6*m1##_2*m2##_1+m2##_2*m1##_3*m1##_2-    \
+		   m1##_8*m1##_3*m2##_0+m1##_6*m2##_0*m1##_5+		    \
+		   m1##_6*m2##_2*m1##_1-m2##_2*m1##_0*m1##_7)*den;	    \
+  									    \
+  ans##_2 = s1##_0*(m1##_1*m1##_5*m2##_0-m1##_2*m2##_0*m1##_4-              \
+		   m1##_0*m1##_5*m2##_1+m1##_3*m1##_2*m2##_1-		    \
+		   m2##_1*m1##_0*m1##_7-m1##_6*m1##_4*m2##_0+		    \
+		   (flt_t)2.0*m1##_4*m1##_0*m2##_2-                         \
+		   (flt_t)2.0*m1##_3*m2##_2*m1##_1+			    \
+		   m1##_3*m1##_7*m2##_0+m1##_6*m2##_1*m1##_1)*den;	    \
+									    \
+  ans##_3 = s1##_1*(-m1##_4*m2##_5*m1##_2+(flt_t)2.0*m1##_4*m1##_8*m2##_3+  \
+		   m1##_5*m1##_1*m2##_5-(flt_t)2.0*m1##_5*m2##_3*m1##_7+    \
+		   m2##_4*m1##_2*m1##_7-m2##_4*m1##_1*m1##_8-		    \
+		   m1##_3*m1##_8*m2##_4+m1##_6*m1##_5*m2##_4- 		    \
+		   m2##_5*m1##_6*m1##_4+m1##_3*m2##_5*m1##_7)*den;	    \
+									    \
+  ans##_4 = s1##_1*(m1##_2*m2##_3*m1##_7-m1##_1*m1##_8*m2##_3+		    \
+		   (flt_t)2.0*m1##_8*m1##_0*m2##_4-m2##_5*m1##_0*m1##_5-    \
+		   (flt_t)2.0*m1##_6*m2##_4*m1##_2-m1##_3*m1##_8*m2##_3+    \
+		   m1##_6*m1##_5*m2##_3+m1##_3*m2##_5*m1##_2-		    \
+		   m1##_0*m2##_5*m1##_7+m2##_5*m1##_1*m1##_6)*den;	    \
+									    \
+  ans##_5 = s1##_1*(m1##_1*m1##_5*m2##_3-m1##_2*m2##_3*m1##_4-		    \
+		   m1##_0*m1##_5*m2##_4+m1##_3*m1##_2*m2##_4+		    \
+		   (flt_t)2.0*m1##_4*m1##_0*m2##_5-m1##_0*m2##_4*m1##_7+    \
+		   m1##_1*m1##_6*m2##_4-m2##_3*m1##_6*m1##_4-		    \
+		   (flt_t)2.0*m1##_3*m1##_1*m2##_5+m1##_3*m2##_3*m1##_7)*   \
+    den;							   	    \
+									    \
+  ans##_6 = s1##_2*(-m1##_4*m1##_2*m2##_8+m1##_1*m1##_5*m2##_8+  	    \
+		   (flt_t)2.0*m1##_4*m2##_6*m1##_8-m1##_1*m2##_7*m1##_8+    \
+		   m1##_2*m1##_7*m2##_7-(flt_t)2.0*m2##_6*m1##_7*m1##_5-    \
+		   m1##_3*m2##_7*m1##_8+m1##_5*m1##_6*m2##_7-		    \
+		   m1##_4*m1##_6*m2##_8+m1##_7*m1##_3*m2##_8)*den;	    \
+									    \
+  ans##_7 = s1##_2*-(m1##_1*m1##_8*m2##_6-m1##_2*m2##_6*m1##_7-		    \
+		    (flt_t)2.0*m2##_7*m1##_0*m1##_8+m1##_5*m2##_8*m1##_0+   \
+		    (flt_t)2.0*m2##_7*m1##_2*m1##_6+m1##_3*m2##_6*m1##_8-   \
+		    m1##_3*m1##_2*m2##_8-m1##_5*m1##_6*m2##_6+		    \
+		    m1##_0*m2##_8*m1##_7-m2##_8*m1##_1*m1##_6)*den;	    \
+									    \
+  ans##_8 = s1##_2*(m1##_1*m1##_5*m2##_6-m1##_2*m2##_6*m1##_4-		    \
+		   m1##_0*m1##_5*m2##_7+m1##_3*m1##_2*m2##_7-		    \
+		   m1##_4*m1##_6*m2##_6-m1##_7*m2##_7*m1##_0+		    \
+		   (flt_t)2.0*m1##_4*m2##_8*m1##_0+m1##_7*m1##_3*m2##_6+    \
+                    m1##_6*m1##_1*m2##_7-(flt_t)2.0*m2##_8*m1##_3*m1##_1)*  \
+    den;								    \
+}
+
+#define ME_vcopy4(dst,src) /* copy components _0.._3 of src to dst */ \
+  { dst##_0 = src##_0;				\
+  dst##_1 = src##_1;				\
+  dst##_2 = src##_2;				\
+  dst##_3 = src##_3; }
+
+#define ME_mldivide3(m1, v_0, v_1, v_2, ans, error)	\
+{ /* solve m1 * ans = (v_0, v_1, v_2) by Gaussian elimination */ \
+  flt_t aug_0, aug_1, aug_2, aug_3, aug_4, aug_5;	\
+  flt_t aug_6, aug_7, aug_8, aug_9, aug_10, aug_11, t;	\
+  /* rows of the augmented matrix [m1 | v]: aug_0..3, 4..7, 8..11 */ \
+  aug_3 = v_0;						\
+  aug_0 = m1##_0;					\
+  aug_1 = m1##_1;					\
+  aug_2 = m1##_2;					\
+  aug_7 = v_1;						\
+  aug_4 = m1##_3;					\
+  aug_5 = m1##_4;					\
+  aug_6 = m1##_5;					\
+  aug_11 = v_2;						\
+  aug_8 = m1##_6;					\
+  aug_9 = m1##_7;					\
+  aug_10 = m1##_8;					\
+  /* move the row with the largest |column 0| entry into row 0 */ \
+  if (fabs(aug_4) > fabs(aug_0)) {			\
+    flt_t swapt;					\
+    swapt = aug_0; aug_0 = aug_4; aug_4 = swapt;	\
+    swapt = aug_1; aug_1 = aug_5; aug_5 = swapt;	\
+    swapt = aug_2; aug_2 = aug_6; aug_6 = swapt;	\
+    swapt = aug_3; aug_3 = aug_7; aug_7 = swapt;	\
+  }							\
+  if (fabs(aug_8) > fabs(aug_0)) {			\
+    flt_t swapt;					\
+    swapt = aug_0; aug_0 = aug_8; aug_8 = swapt;	\
+    swapt = aug_1; aug_1 = aug_9; aug_9 = swapt;	\
+    swapt = aug_2; aug_2 = aug_10; aug_10 = swapt;	\
+    swapt = aug_3; aug_3 = aug_11; aug_11 = swapt;	\
+  }							\
+  /* error is only ever set to 1 here; caller must initialise it */ \
+  if (aug_0 != (flt_t)0.0) {				\
+  } else if (aug_4 != (flt_t)0.0) {			\
+    flt_t swapt;					\
+    swapt = aug_0; aug_0 = aug_4; aug_4 = swapt;	\
+    swapt = aug_1; aug_1 = aug_5; aug_5 = swapt;	\
+    swapt = aug_2; aug_2 = aug_6; aug_6 = swapt;	\
+    swapt = aug_3; aug_3 = aug_7; aug_7 = swapt;	\
+  } else if (aug_8 != (flt_t)0.0) {			\
+    flt_t swapt;					\
+    swapt = aug_0; aug_0 = aug_8; aug_8 = swapt;	\
+    swapt = aug_1; aug_1 = aug_9; aug_9 = swapt;	\
+    swapt = aug_2; aug_2 = aug_10; aug_10 = swapt;	\
+    swapt = aug_3; aug_3 = aug_11; aug_11 = swapt;	\
+  } else						\
+    error = 1;						\
+  /* eliminate column 0 from rows 1 and 2 */		\
+  t = aug_4 / aug_0;					\
+  aug_5 -= t * aug_1;					\
+  aug_6 -= t * aug_2;					\
+  aug_7 -= t * aug_3;					\
+  t = aug_8 / aug_0;					\
+  aug_9 -= t * aug_1;					\
+  aug_10 -= t * aug_2;					\
+  aug_11 -= t * aug_3;					\
+  /* move the larger |column 1| entry of rows 1 and 2 into row 1 */ \
+  if (fabs(aug_9) > fabs(aug_5)) {			\
+    flt_t swapt;					\
+    swapt = aug_4; aug_4 = aug_8; aug_8 = swapt;	\
+    swapt = aug_5; aug_5 = aug_9; aug_9 = swapt;	\
+    swapt = aug_6; aug_6 = aug_10; aug_10 = swapt;	\
+    swapt = aug_7; aug_7 = aug_11; aug_11 = swapt;	\
+  }							\
+  /* flag a missing column 1 pivot, as is done for column 0 above */ \
+  if (aug_5 != (flt_t)0.0) {				\
+  } else if (aug_9 != (flt_t)0.0) {			\
+    flt_t swapt;					\
+    swapt = aug_4; aug_4 = aug_8; aug_8 = swapt;	\
+    swapt = aug_5; aug_5 = aug_9; aug_9 = swapt;	\
+    swapt = aug_6; aug_6 = aug_10; aug_10 = swapt;	\
+    swapt = aug_7; aug_7 = aug_11; aug_11 = swapt;	\
+  } else error = 1;					\
+  /* eliminate column 1 from row 2 */			\
+  t = aug_9 / aug_5;					\
+  aug_10 -= t * aug_6;					\
+  aug_11 -= t * aug_7;					\
+  /* a zero last pivot also means the system is singular */ \
+  if (aug_10 == (flt_t)0.0)				\
+    error = 1;						\
+  /* back substitution: ans_2, then ans_1, then ans_0 */ \
+  ans##_2 = aug_11/aug_10;				\
+  t = (flt_t)0.0;					\
+  t += aug_6 * ans##_2;					\
+  ans##_1 = (aug_7-t) / aug_5;				\
+  t = (flt_t)0.0;					\
+  t += aug_1 * ans##_1;					\
+  t += aug_2 * ans##_2;					\
+  ans##_0 = (aug_3 - t) / aug_0;			\
+}
+
+#endif
diff --git a/src/USER-INTEL/neigh_half_bin_intel.cpp b/src/USER-INTEL/neigh_half_bin_intel.cpp
new file mode 100644
index 0000000000..a5f12a56f9
--- /dev/null
+++ b/src/USER-INTEL/neigh_half_bin_intel.cpp
@@ -0,0 +1,1453 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "atom.h"
+#include "comm.h"
+#include "group.h"
+#include "fix_intel.h"
+
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+using namespace LAMMPS_NS;
+
+#ifdef _LMP_INTEL_OFFLOAD
+#pragma offload_attribute(push,target(mic))
+#endif
+
+// Map the point (x0,x1,x2) to a flattened index into the mbinx x mbiny bin
+// grid. Per axis: at or above bboxhi the bin is nbin plus the truncated
+// offset from bboxhi; otherwise the truncated offset from bboxlo, clamped to
+// nbin-1 when the point is at or above bboxlo and decremented by one below it.
+template <class flt_t>
+inline int mcoord2bin(const flt_t x0, const flt_t x1, const flt_t x2,
+                      const flt_t bboxlo0, const flt_t bboxlo1,
+                      const flt_t bboxlo2, const flt_t bboxhi0,
+                      const flt_t bboxhi1, const flt_t bboxhi2,
+                      const flt_t bininvx, const flt_t bininvy,
+                      const flt_t bininvz, const int nbinx, const int nbiny,
+                      const int nbinz, const int mbinx, const int mbiny,
+                      const int mbinz, const int mbinxlo, const int mbinylo,
+                      const int mbinzlo)
+{
+  int ix, iy, iz;
+
+  if (x0 >= bboxhi0) ix = static_cast<int> ((x0 - bboxhi0) * bininvx) + nbinx;
+  else {
+    ix = static_cast<int> ((x0 - bboxlo0) * bininvx);
+    ix = (x0 >= bboxlo0) ? MIN(ix, nbinx-1) : ix - 1;
+  }
+
+  if (x1 >= bboxhi1) iy = static_cast<int> ((x1 - bboxhi1) * bininvy) + nbiny;
+  else {
+    iy = static_cast<int> ((x1 - bboxlo1) * bininvy);
+    iy = (x1 >= bboxlo1) ? MIN(iy, nbiny-1) : iy - 1;
+  }
+
+  if (x2 >= bboxhi2) iz = static_cast<int> ((x2 - bboxhi2) * bininvz) + nbinz;
+  else {
+    iz = static_cast<int> ((x2 - bboxlo2) * bininvz);
+    iz = (x2 >= bboxlo2) ? MIN(iz, nbinz-1) : iz - 1;
+  }
+
+  // flatten relative to the low corner (mbinxlo,mbinylo,mbinzlo), x fastest
+  const int zpart = (iz - mbinzlo) * mbiny * mbinx;
+  const int ypart = (iy - mbinylo) * mbinx;
+  return zpart + ypart + (ix - mbinxlo);
+}
+
+// ofind_special: look up `tag` in the special-neighbor list of atom `i` and
+// store the classification in `which`:
+//    0  tag is not in the list, or the matching special_flag entry is 1
+//   -1  the matching special_flag entry is 0
+//    k  otherwise, with k = 1, 2 or 3 for list positions below n1, below n2,
+//       or below n3 (counts read from nspecial[3*i .. 3*i+2])
+// No early exit: if tag is listed more than once the last match decides.
+// `maxspecial` is not an argument; it must be visible where the macro expands.
+#define ofind_special(which, special, nspecial, i, tag, special_flag) \
+{                                                                     \
+  which = 0;                                                          \
+  const int n1 = (nspecial)[(i) * 3];                                 \
+  const int n2 = (nspecial)[(i) * 3 + 1];                             \
+  const int n3 = (nspecial)[(i) * 3 + 2];                             \
+  const int *sptr = (special) + (i) * maxspecial;                     \
+  for (int s = 0; s < n3; s++) {                                      \
+    if (sptr[s] == (tag)) {                                           \
+      const int ofs_level = (s < n1) ? 1 : ((s < n2) ? 2 : 3);        \
+      const int ofs_flag = (special_flag)[ofs_level];                 \
+      if (ofs_flag == 0) which = -1;                                  \
+      else if (ofs_flag == 1) which = 0;                              \
+      else which = ofs_level;                                         \
+    }                                                                 \
+  }                                                                   \
+}
+
+#ifdef _LMP_INTEL_OFFLOAD
+#pragma offload_attribute(pop)
+#endif
+
+// Rebuild the binhead/bins linked lists from the packed coordinates in xin.
+// Atoms are pushed onto the head of their bin's list in descending index
+// order. With includegroup set, only ghost atoms whose mask has the group bit
+// and the first atom->nfirst atoms are binned.
+template <class flt_t, class acc_t>
+void Neighbor::bin_atoms(void * xin) {
+  const ATOM_T * restrict const x = (const ATOM_T * restrict const)xin;
+  int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+  int *mask = atom->mask;
+  const flt_t lo0 = this->bboxlo[0];
+  const flt_t lo1 = this->bboxlo[1];
+  const flt_t lo2 = this->bboxlo[2];
+  const flt_t hi0 = this->bboxhi[0];
+  const flt_t hi1 = this->bboxhi[1];
+  const flt_t hi2 = this->bboxhi[2];
+  const flt_t invx = this->bininvx;
+  const flt_t invy = this->bininvy;
+  const flt_t invz = this->bininvz;
+
+  int ia, slot;
+  for (ia = 0; ia < mbins; ia++) binhead[ia] = -1;   // -1 marks an empty bin
+
+  if (!includegroup) {
+    for (ia = nall-1; ia >= 0; ia--) {
+      slot = mcoord2bin(x[ia].x, x[ia].y, x[ia].z, lo0, lo1, lo2,
+                        hi0, hi1, hi2, invx, invy, invz, nbinx, nbiny, nbinz,
+                        mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo);
+      bins[ia] = binhead[slot];
+      binhead[slot] = ia;
+    }
+    return;
+  }
+  const int bitmask = group->bitmask[includegroup];
+  for (ia = nall-1; ia >= nlocal; ia--) {
+    if (!(mask[ia] & bitmask)) continue;
+    slot = mcoord2bin(x[ia].x, x[ia].y, x[ia].z, lo0, lo1, lo2,
+                      hi0, hi1, hi2, invx, invy, invz, nbinx, nbiny, nbinz,
+                      mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo);
+    bins[ia] = binhead[slot];
+    binhead[slot] = ia;
+  }
+  for (ia = atom->nfirst-1; ia >= 0; ia--) {
+    slot = mcoord2bin(x[ia].x, x[ia].y, x[ia].z, lo0, lo1, lo2,
+                      hi0, hi1, hi2, invx, invy, invz, nbinx, nbiny, nbinz,
+                      mbinx, mbiny, mbinz, mbinxlo, mbinylo, mbinzlo);
+    bins[ia] = binhead[slot];
+    binhead[slot] = ia;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Intel-package entry point: binned half list, partial Newton's 3rd law
+   owned atom i scans every bin of the stencil, its own bin included
+   a pair of owned atoms is stored once, by the atom with the lower index
+   a pair with ghost j is stored by me (also stored by proc owning j)
+   atoms [0,off_end) use the offload pass, [host_start,inum) the host pass
+------------------------------------------------------------------------- */
+
+void Neighbor::half_bin_no_newton_intel(NeighList *list)
+{
+  // list covers the first nfirst atoms with includegroup, else all owned atoms
+  const int inum = (includegroup) ? atom->nfirst : atom->nlocal;
+  list->inum = inum;
+
+  FixIntel *fix = static_cast<FixIntel *>(fix_intel);
+  const int off_end = fix->offload_end_neighbor();
+
+  // host pass starts at off_end, or at 0 when a full host list is requested
+  int host_start = off_end;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->full_host_list()) host_start = 0;
+  if (exclude)
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  // pick the precision mode; each one runs the offload pass, then the host pass
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    hbnni<float,double>(1, list, fix->get_mixed_buffers(), 0, off_end, fix);
+    hbnni<float,double>(0, list, fix->get_mixed_buffers(),
+                        host_start, inum, fix);
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    hbnni<double,double>(1, list, fix->get_double_buffers(), 0, off_end, fix);
+    hbnni<double,double>(0, list, fix->get_double_buffers(),
+                         host_start, inum, fix);
+  } else {
+    hbnni<float,float>(1, list, fix->get_single_buffers(), 0, off_end, fix);
+    hbnni<float,float>(0, list, fix->get_single_buffers(),
+                       host_start, inum, fix);
+  }
+}
+
+// One pass of the partial-Newton binned half-list build for atoms
+// [astart,aend). With offload != 0 the coordinates are packed, atoms are
+// binned on the host and the search runs in the offload region; with
+// offload == 0 only the search runs, reusing bins built by an earlier pass.
+// *overflow is set to 1 when a thread runs out of neighbor storage.
+template <class flt_t, class acc_t>
+void Neighbor::hbnni(const int offload, NeighList *list, void *buffers_in,
+                     const int astart, const int aend, void *fix_in) {
+  IntelBuffers<flt_t,acc_t> *buffers = (IntelBuffers<flt_t,acc_t> *)buffers_in;
+  FixIntel *fix = (FixIntel *)fix_in;
+  const int nall = atom->nlocal + atom->nghost;
+  int pad = 1;
+
+  if (offload) {
+    fix->start_watch(TIME_PACK);
+    buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
+    buffers->grow_nbor(list, atom->nlocal, aend);
+    // slot nall holds a far-away dummy atom; list padding entries refer to it
+    ATOM_T biga;
+    biga.x = INTEL_BIGP; biga.y = INTEL_BIGP; biga.z = INTEL_BIGP;
+    biga.w = 1;
+    buffers->get_x()[nall] = biga;
+
+    const int nthreads = comm->nthreads;
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(buffers)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads,
+                                sizeof(ATOM_T));
+      buffers->thr_pack(ifrom, ito, 0);
+    }
+    fix->stop_watch(TIME_PACK);
+
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    bin_atoms<flt_t,acc_t>(buffers->get_x());
+    if (INTEL_MIC_NBOR_PAD > 1)
+      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  } else {
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    if (INTEL_NBOR_PAD > 1)
+      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  }
+  const int pad_width = pad;
+
+  if (aend-astart == 0) {
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    return;
+  }
+
+  const ATOM_T * restrict const x = buffers->get_x();
+  int * restrict const firstneigh = buffers->firstneigh(list);
+
+  const int molecular = atom->molecular;
+  int *ns = NULL, *s = NULL;
+  int tag_size, special_size;
+  if (molecular) {
+    s = atom->special[0];
+    ns = atom->nspecial[0];
+    tag_size = nall;
+    special_size = aend;
+  } else {
+    s = &buffers->_special_holder;
+    ns = &buffers->_nspecial_holder;
+    tag_size = 0;
+    special_size = 0;
+  }
+  const int * restrict const special = s;
+  const int * restrict const nspecial = ns;
+  const int maxspecial = atom->maxspecial;   // read by the ofind_special macro
+  const int * restrict const tag = atom->tag;
+
+  int * restrict const ilist = list->ilist;
+  int * restrict numneigh = list->numneigh;
+  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int nstencil = list->nstencil;
+  const int * restrict const stencil = list->stencil;
+  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int ntypes = atom->ntypes + 1;       // row stride into cutneighsq
+  const int nlocal = atom->nlocal;
+
+  #ifndef _LMP_INTEL_OFFLOAD
+  int * const mask = atom->mask;
+  int * const molecule = atom->molecule;
+  #endif
+
+  int tnum;
+  int *overflow;
+  double *timer_compute;
+  if (offload) {
+    timer_compute = fix->off_watch_neighbor();
+    tnum = buffers->get_off_threads();
+    overflow = fix->get_off_overflow_flag();
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    fix->start_watch(TIME_OFFLOAD_LATENCY);
+  } else {
+    tnum = comm->nthreads;
+    overflow = fix->get_overflow_flag();
+  }
+  const int nthreads = tnum;
+  const int maxnbors = buffers->get_max_nbors();
+
+  const flt_t bboxlo0 = this->bboxlo[0];
+  const flt_t bboxlo1 = this->bboxlo[1];
+  const flt_t bboxlo2 = this->bboxlo[2];
+  const flt_t bboxhi0 = this->bboxhi[0];
+  const flt_t bboxhi1 = this->bboxhi[1];
+  const flt_t bboxhi2 = this->bboxhi[2];
+  const flt_t bininvx = this->bininvx;
+  const flt_t bininvy = this->bininvy;
+  const flt_t bininvz = this->bininvz;
+  // The dummy atom used as list padding must lie outside the largest cutoff
+  {
+    const flt_t dx = (INTEL_BIGP - bboxhi0);
+    const flt_t dy = (INTEL_BIGP - bboxhi1);
+    const flt_t dz = (INTEL_BIGP - bboxhi2);
+    if (dx * dx + dy * dy + dz * dz < static_cast<flt_t>(cutneighmaxsq))
+      error->one(FLERR,
+        "Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * restrict const binhead = this->binhead;
+  const int * restrict const special_flag = this->special_flag;
+  const int nbinx = this->nbinx;
+  const int nbiny = this->nbiny;
+  const int nbinz = this->nbinz;
+  const int mbinxlo = this->mbinxlo;
+  const int mbinylo = this->mbinylo;
+  const int mbinzlo = this->mbinzlo;
+  const int mbinx = this->mbinx;
+  const int mbiny = this->mbiny;
+  const int mbinz = this->mbinz;
+  const int * restrict const bins = this->bins;
+  const int cop = fix->coprocessor_number();
+  const int separate_buffers = fix->separate_buffers();
+  #pragma offload target(mic:cop) if(offload) \
+    in(x:length(nall+1) alloc_if(0) free_if(0)) \
+    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
+    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
+    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
+    in(bins:length(nall) alloc_if(0) free_if(0)) \
+    in(binhead:length(mbins) alloc_if(0) free_if(0)) \
+    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    out(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
+    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
+    in(special_flag:length(0) alloc_if(0) free_if(0)) \
+    in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz) \
+    in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,offload) \
+    in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \
+    in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \
+    out(overflow:length(5) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(numneigh)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    overflow[LMP_LOCAL_MIN] = astart;
+    overflow[LMP_LOCAL_MAX] = aend - 1;
+    overflow[LMP_GHOST_MIN] = nall;
+    overflow[LMP_GHOST_MAX] = -1;
+    #endif
+
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(numneigh,overflow)
+    #endif
+    {
+      #ifdef _LMP_INTEL_OFFLOAD
+      int lmin = nall, lmax = -1, gmin = nall, gmax = -1;
+      #endif
+
+      const int num = aend - astart;
+      int tid, ifrom, ito;
+      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
+      ifrom += astart;
+      ito += astart;
+
+      int which;
+      // this thread's slice of firstneigh: [(ifrom+tid)*maxnbors, list_size)
+      const int list_size = (ito + tid + 1) * maxnbors;
+      int ct = (ifrom + tid) * maxnbors;
+      int *neighptr = firstneigh + ct;
+      for (int i = ifrom; i < ito; i++) {
+        int j, k, n, n2, itype, jtype, ibin;
+        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
+        // owned neighbors fill neighptr[0..n), ghosts neighptr[maxnbors..n2)
+        n = 0;
+        n2 = maxnbors;
+        xtmp = x[i].x;
+        ytmp = x[i].y;
+        ztmp = x[i].z;
+        itype = x[i].w;
+        const int ioffset = ntypes*itype;
+
+        // loop over all atoms in other bins in stencil including self
+        // only store pair if i < j
+        // stores own/own pairs only once
+        // stores own/ghost pairs on both procs
+        ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2,
+                          bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz,
+                          nbinx, nbiny, nbinz, mbinx, mbiny, mbinz,
+                          mbinxlo, mbinylo, mbinzlo);
+
+        for (k = 0; k < nstencil; k++) {
+          for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
+            if (j <= i) continue;
+
+            jtype = x[j].w;
+            #ifndef _LMP_INTEL_OFFLOAD
+            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+            #endif
+
+            delx = xtmp - x[j].x;
+            dely = ytmp - x[j].y;
+            delz = ztmp - x[j].z;
+            rsq = delx * delx + dely * dely + delz * delz;
+            if (rsq <= cutneighsq[ioffset + jtype]) {
+              if (j < nlocal) {
+                neighptr[n++] = j;
+                #ifdef _LMP_INTEL_OFFLOAD
+                if (j < lmin) lmin = j;
+                if (j > lmax) lmax = j;
+                #endif
+              } else {
+                neighptr[n2++] = j;
+                #ifdef _LMP_INTEL_OFFLOAD
+                if (j < gmin) gmin = j;
+                if (j > gmax) gmax = j;
+                #endif
+              }
+            }
+          }
+        }
+        ilist[i] = i;
+
+        cnumneigh[i] = ct;
+        if (n > maxnbors) *overflow = 1;   // owned entries ran into ghost area
+        for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
+        while( (n % pad_width) != 0 ) neighptr[n++] = nall;
+        numneigh[i] = n;
+        while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
+        ct += n;
+        if (ct + n + maxnbors > list_size) {
+          *overflow = 1;
+          ct = (ifrom + tid) * maxnbors;
+        }
+        neighptr = firstneigh + ct;  // follows ct, also when ct was rewound
+      }
+
+      if (*overflow == 1)
+        for (int i = ifrom; i < ito; i++)
+          numneigh[i] = 0;
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (separate_buffers) {
+        #if defined(_OPENMP)
+        #pragma omp critical
+        #endif
+        {
+          if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
+          if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
+          if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
+          if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
+        }
+        #pragma omp barrier
+      }
+
+      int ghost_offset = 0, nall_offset = nall;
+      if (separate_buffers) {
+        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+        if (nghost < 0) nghost = 0;
+        if (offload) {
+          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+        } else {
+          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+          nall_offset = nlocal + nghost;
+        }
+      }
+      #endif
+      // NOTE(review): padding entries (j == nall) are looked up too -> tag[nall]
+      if (molecular) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          for (int jj = 0; jj < jnum; jj++) {
+            const int j = jlist[jj];
+            ofind_special(which, special, nspecial, i, tag[j], special_flag);
+            #ifdef _LMP_INTEL_OFFLOAD
+            if (j >= nlocal) {
+              if (j == nall)
+                jlist[jj] = nall_offset;
+              else if (which > 0)
+                jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+              else jlist[jj]-=ghost_offset;
+            } else
+            #endif
+              if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+          }
+        }
+      }
+      #ifdef _LMP_INTEL_OFFLOAD
+      else if (separate_buffers) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          int jj = 0;
+          for (jj = 0; jj < jnum; jj++)
+            if (jlist[jj] >= nlocal) break;
+          while (jj < jnum) {
+            if (jlist[jj] == nall) jlist[jj] = nall_offset;
+            else jlist[jj] -= ghost_offset;
+            jj++;
+          }
+        }
+      }
+      #endif
+    } // end omp
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  if (offload) {
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+    #ifdef _LMP_INTEL_OFFLOAD
+    for (int n = 0; n < aend; n++) {
+      ilist[n] = n;
+      numneigh[n] = 0;
+    }
+    #endif
+  } else {
+    for (int i = astart; i < aend; i++)
+      list->firstneigh[i] = firstneigh + cnumneigh[i];
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (separate_buffers) {
+      fix->start_watch(TIME_PACK);
+      fix->set_neighbor_host_sizes();
+      buffers->pack_sep_from_single(fix->host_min_local(),
+                                    fix->host_used_local(),
+                                    fix->host_min_ghost(),
+                                    fix->host_used_ghost());
+      fix->stop_watch(TIME_PACK);
+    }
+    #endif
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Intel-package entry point: binned half list, full Newton's 3rd law
+   owned atom i scans the rest of its own bin plus the Newton stencil bins
+   every pair is stored exactly once by some processor
+   with offload_noghost set, the offload pass skips every ghost neighbor and
+   the host pass skips owned neighbors of atoms below off_end
+------------------------------------------------------------------------- */
+
+void Neighbor::half_bin_newton_intel(NeighList *list)
+{
+  const int inum = (includegroup) ? atom->nfirst : atom->nlocal;
+  list->inum = inum;
+
+  FixIntel *fix = static_cast<FixIntel *>(fix_intel);
+  const int off_end = fix->offload_end_neighbor();
+  int host_start = fix->host_start_neighbor();
+  int offload_noghost = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->full_host_list()) host_start = 0;
+  offload_noghost = fix->offload_noghost();
+  if (exclude)
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    if (!offload_noghost) {
+      hbni<float,double,0>(1, list,
+                           fix->get_mixed_buffers(), 0, off_end, fix);
+      hbni<float,double,0>(0, list,
+                           fix->get_mixed_buffers(), host_start, inum, fix);
+    } else {
+      hbni<float,double,1>(1, list,
+                           fix->get_mixed_buffers(), 0, off_end, fix);
+      hbni<float,double,1>(0, list, fix->get_mixed_buffers(),
+                           host_start, inum, fix, off_end);
+    }
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    if (!offload_noghost) {
+      hbni<double,double,0>(1, list,
+                            fix->get_double_buffers(), 0, off_end, fix);
+      hbni<double,double,0>(0, list,
+                            fix->get_double_buffers(), host_start, inum, fix);
+    } else {
+      hbni<double,double,1>(1, list,
+                            fix->get_double_buffers(), 0, off_end, fix);
+      hbni<double,double,1>(0, list, fix->get_double_buffers(),
+                            host_start, inum, fix, off_end);
+    }
+  } else {
+    if (!offload_noghost) {
+      hbni<float,float,0>(1, list, fix->get_single_buffers(), 0, off_end, fix);
+      hbni<float,float,0>(0, list,
+                          fix->get_single_buffers(), host_start, inum, fix);
+    } else {
+      hbni<float,float,1>(1, list, fix->get_single_buffers(), 0, off_end, fix);
+      hbni<float,float,1>(0, list, fix->get_single_buffers(),
+                          host_start, inum, fix, off_end);
+    }
+  }
+}
+
+template <class flt_t, class acc_t, int offload_noghost>
+void Neighbor::hbni(const int offload, NeighList *list, void *buffers_in,
+                    const int astart, const int aend, void *fix_in,
+                    const int offload_end) {
+  IntelBuffers<flt_t,acc_t> *buffers = (IntelBuffers<flt_t,acc_t> *)buffers_in;
+  FixIntel *fix = (FixIntel *)fix_in;
+  const int nall = atom->nlocal + atom->nghost;
+  int pad = 1;
+
+  if (offload) {
+    fix->start_watch(TIME_PACK);
+    buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
+    buffers->grow_nbor(list, atom->nlocal, aend);
+
+    ATOM_T biga;
+    biga.x = INTEL_BIGP;
+    biga.y = INTEL_BIGP;
+    biga.z = INTEL_BIGP;
+    biga.w = 1;
+    buffers->get_x()[nall]=biga;
+
+    const int nthreads = comm->nthreads;
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(buffers)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, 
+				sizeof(ATOM_T));
+      buffers->thr_pack(ifrom, ito, 0);
+    }
+    fix->stop_watch(TIME_PACK);
+
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    bin_atoms<flt_t,acc_t>(buffers->get_x());
+    if (INTEL_MIC_NBOR_PAD > 1)
+      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  } else {
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    if (INTEL_NBOR_PAD > 1)
+      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  }    
+  const int pad_width = pad;
+
+  if (aend-astart == 0) {
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    return;
+  }
+
+  const ATOM_T * restrict const x = buffers->get_x();
+  int * restrict const firstneigh = buffers->firstneigh(list);
+  int nall_t = nall;
+  if (offload_noghost && offload) nall_t = atom->nlocal;
+  const int e_nall = nall_t;
+
+  const int molecular = atom->molecular;
+  int *ns = NULL, *s = NULL;
+  int tag_size, special_size;
+  if (molecular) {
+    s = atom->special[0];
+    ns = atom->nspecial[0];
+    tag_size = e_nall;
+    special_size = aend;
+  } else {
+    s = &buffers->_special_holder;
+    ns = &buffers->_nspecial_holder;
+    tag_size = 0;
+    special_size = 0;
+  }
+  const int * restrict const special = s;
+  const int * restrict const nspecial = ns;
+  const int maxspecial = atom->maxspecial;
+  const int * restrict const tag = atom->tag;
+
+  int * restrict const ilist = list->ilist;
+  int * restrict numneigh = list->numneigh;
+  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int nstencil = list->nstencil;
+  const int * restrict const stencil = list->stencil;
+  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int ntypes = atom->ntypes + 1;
+  const int nlocal = atom->nlocal;
+
+  #ifndef _LMP_INTEL_OFFLOAD
+  int * const mask = atom->mask;
+  int * const molecule = atom->molecule;
+  #endif
+
+  int tnum;
+  int *overflow;
+  double *timer_compute;
+  if (offload) {
+    timer_compute = fix->off_watch_neighbor();
+    tnum = buffers->get_off_threads();
+    overflow = fix->get_off_overflow_flag();
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    fix->start_watch(TIME_OFFLOAD_LATENCY);
+  } else {
+    tnum = comm->nthreads;
+    overflow = fix->get_overflow_flag();
+  }
+  const int nthreads = tnum;
+  const int maxnbors = buffers->get_max_nbors();
+
+  const flt_t bboxlo0 = this->bboxlo[0];
+  const flt_t bboxlo1 = this->bboxlo[1];
+  const flt_t bboxlo2 = this->bboxlo[2];
+  const flt_t bboxhi0 = this->bboxhi[0];
+  const flt_t bboxhi1 = this->bboxhi[1];
+  const flt_t bboxhi2 = this->bboxhi[2];
+  const flt_t bininvx = this->bininvx;
+  const flt_t bininvy = this->bininvy;
+  const flt_t bininvz = this->bininvz;
+  // Make sure dummy coordinates to eliminate loop remainder not within cutoff
+  {
+    const flt_t dx = (INTEL_BIGP - bboxhi0);
+    const flt_t dy = (INTEL_BIGP - bboxhi1);
+    const flt_t dz = (INTEL_BIGP - bboxhi2);
+    if (dx * dx + dy * dy + dz * dz < static_cast<flt_t>(cutneighmaxsq))
+      error->one(FLERR,
+	"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * restrict const binhead = this->binhead;
+  const int * restrict const special_flag = this->special_flag;
+  const int nbinx = this->nbinx;
+  const int nbiny = this->nbiny;
+  const int nbinz = this->nbinz;
+  const int mbinxlo = this->mbinxlo;
+  const int mbinylo = this->mbinylo;
+  const int mbinzlo = this->mbinzlo;
+  const int mbinx = this->mbinx;
+  const int mbiny = this->mbiny;
+  const int mbinz = this->mbinz;
+  const int * restrict const bins = this->bins;
+  const int cop = fix->coprocessor_number();
+  const int separate_buffers = fix->separate_buffers();
+  #pragma offload target(mic:cop) if(offload) \
+    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
+    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
+    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
+    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
+    in(bins:length(nall) alloc_if(0) free_if(0)) \
+    in(binhead:length(mbins) alloc_if(0) free_if(0)) \
+    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    out(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
+    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
+    in(special_flag:length(0) alloc_if(0) free_if(0)) \
+    in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz,e_nall,offload)\
+    in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,offload_end) \
+    in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \
+    in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \
+    out(overflow:length(5) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(numneigh)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    overflow[LMP_LOCAL_MIN] = astart;
+    overflow[LMP_LOCAL_MAX] = aend - 1;
+    overflow[LMP_GHOST_MIN] = e_nall;
+    overflow[LMP_GHOST_MAX] = -1;
+    #endif
+
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(numneigh, overflow)
+    #endif
+    {
+      #ifdef _LMP_INTEL_OFFLOAD
+      int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;
+      #endif
+      
+      const int num = aend - astart;
+      int tid, ifrom, ito;
+      IP_PRE_omp_range_id(ifrom, ito, tid, num, nthreads);
+      ifrom += astart;
+      ito += astart;
+
+      int which;
+
+      const int list_size = (ito + tid + 1) * maxnbors;
+      int ct = (ifrom + tid) * maxnbors;
+      int *neighptr = firstneigh + ct;
+      for (int i = ifrom; i < ito; i++) {
+        int j, k, n, n2, itype, jtype, ibin;
+        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
+
+        n = 0;
+	n2 = maxnbors;
+
+        xtmp = x[i].x;
+        ytmp = x[i].y;
+        ztmp = x[i].z;
+        itype = x[i].w;
+        const int ioffset = ntypes * itype;
+
+        // loop over rest of atoms in i's bin, ghosts are at end of linked list
+        // if j is owned atom, store it, since j is beyond i in linked list
+        // if j is ghost, only store if j coords are "above/to the right" of i
+
+        for (j = bins[i]; j >= 0; j = bins[j]) {
+          if (j >= nlocal) {
+            if (offload_noghost && offload) continue;
+            if (x[j].z < ztmp) continue;
+            if (x[j].z == ztmp) {
+              if (x[j].y < ytmp) continue;
+              if (x[j].y == ytmp && x[j].x < xtmp) continue;
+            }
+          } else if (offload_noghost && i < offload_end) continue;
+
+          jtype = x[j].w;
+          #ifndef _LMP_INTEL_OFFLOAD
+          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+	  #endif
+
+          delx = xtmp - x[j].x;
+          dely = ytmp - x[j].y;
+          delz = ztmp - x[j].z;
+          rsq = delx * delx + dely * dely + delz * delz;
+
+          if (rsq <= cutneighsq[ioffset + jtype]) {
+	    if (j < nlocal) {
+	      neighptr[n++] = j;
+	      #ifdef _LMP_INTEL_OFFLOAD
+	      if (j < lmin) lmin = j;
+	      if (j > lmax) lmax = j;
+              #endif
+	    } else {
+	      neighptr[n2++] = j;
+	      #ifdef _LMP_INTEL_OFFLOAD
+	      if (j < gmin) gmin = j;
+	      if (j > gmax) gmax = j;
+              #endif
+            }
+	  }
+        }
+        // loop over all atoms in other bins in stencil, store every pair
+
+        ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2,
+                          bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz,
+                          nbinx, nbiny, nbinz, mbinx, mbiny, mbinz,
+                          mbinxlo, mbinylo, mbinzlo);
+
+        for (k = 0; k < nstencil; k++) {
+          for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
+            if (offload_noghost) {
+              if (j < nlocal) {
+                if (i < offload_end) continue;
+              } else if (offload) continue;
+            }
+
+            jtype = x[j].w;
+            #ifndef _LMP_INTEL_OFFLOAD
+            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+	    #endif
+
+            delx = xtmp - x[j].x;
+            dely = ytmp - x[j].y;
+            delz = ztmp - x[j].z;
+            rsq = delx * delx + dely * dely + delz * delz;
+            if (rsq <= cutneighsq[ioffset + jtype]) {
+	      if (j < nlocal) {
+		neighptr[n++] = j;
+                #ifdef _LMP_INTEL_OFFLOAD
+		if (j < lmin) lmin = j;
+		if (j > lmax) lmax = j;
+                #endif
+	      } else {
+		neighptr[n2++] = j;
+	        #ifdef _LMP_INTEL_OFFLOAD
+		if (j < gmin) gmin = j;
+		if (j > gmax) gmax = j;
+                #endif
+	      }
+	    }
+          }
+        }
+        ilist[i] = i;
+
+        cnumneigh[i] = ct;
+        if (n > maxnbors) *overflow = 1;
+	for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];
+        while( (n % pad_width) != 0 ) neighptr[n++] = e_nall;
+        numneigh[i] = n;
+	while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;
+        ct += n;
+        neighptr += n;
+	if (ct + n + maxnbors > list_size) {
+	  *overflow = 1;
+	  ct = (ifrom + tid) * maxnbors;
+	}
+      }
+
+      if (*overflow == 1)
+        for (int i = ifrom; i < ito; i++)
+          numneigh[i] = 0;
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (separate_buffers) {
+        #if defined(_OPENMP)
+        #pragma omp critical
+        #endif
+        {
+  	  if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
+	  if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
+	  if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
+	  if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
+        }
+	#pragma omp barrier
+      }
+
+      int ghost_offset = 0, nall_offset = e_nall;
+      if (separate_buffers) {
+	int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+	if (nghost < 0) nghost = 0;
+	if (offload) {
+	  ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+	  nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+	} else {
+	  ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+	  nall_offset = nlocal + nghost;
+	}
+      }
+      #endif
+
+      if (molecular) {
+        for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          for (int jj = 0; jj < jnum; jj++) {
+            const int j = jlist[jj];
+            ofind_special(which, special, nspecial, i, tag[j],
+                          special_flag);
+	    #ifdef _LMP_INTEL_OFFLOAD
+	    if (j >= nlocal) {
+	      if (j == e_nall)
+		jlist[jj] = nall_offset;
+	      else if (which > 0) 
+		jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+	      else jlist[jj]-=ghost_offset;
+            } else
+	    #endif
+            if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+          }
+        }
+      }
+      #ifdef _LMP_INTEL_OFFLOAD
+      else if (separate_buffers) {
+	for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+	  int jj = 0;
+	  for (jj = 0; jj < jnum; jj++)
+	    if (jlist[jj] >= nlocal) break;
+	  while (jj < jnum) {
+	    if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
+	    else jlist[jj] -= ghost_offset;
+	    jj++;
+	  }
+	}
+      }
+      #endif
+    } // end omp
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  if (offload) {
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+    #ifdef _LMP_INTEL_OFFLOAD
+    for (int n = 0; n < aend; n++) {
+      ilist[n] = n;
+      numneigh[n] = 0;
+    }
+    #endif
+  } else {
+    for (int i = astart; i < aend; i++)
+      list->firstneigh[i] = firstneigh + cnumneigh[i];
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (separate_buffers) {
+      fix->start_watch(TIME_PACK);
+      fix->set_neighbor_host_sizes();
+      buffers->pack_sep_from_single(fix->host_min_local(),
+				    fix->host_used_local(),
+				    fix->host_min_ghost(),
+				    fix->host_used_ghost());
+      fix->stop_watch(TIME_PACK);
+    }
+    #endif
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Intel-package driver for the binned half neighbor list with Newton's
+   3rd law on triclinic boxes: every owned atom i walks its own bin plus
+   the other bins of the triclinic stencil; each pair is stored once
+------------------------------------------------------------------------- */
+
+void Neighbor::half_bin_newton_tri_intel(NeighList *list)
+{
+  const int nowned = (includegroup) ? atom->nfirst : atom->nlocal;
+  list->inum = nowned;
+  // fix_intel supplies the precision mode, the buffers and the work split
+  FixIntel *intel = static_cast<FixIntel *>(fix_intel);
+  const int cop_end = intel->offload_end_neighbor();
+  int host_begin = intel->host_start_neighbor();
+  int noghost = 0;
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (intel->full_host_list()) host_begin = 0;
+  noghost = intel->offload_noghost();
+  if (exclude)
+    error->all(FLERR, "Exclusion lists not yet supported for Intel offload");
+  #endif
+
+  // each case runs hbnti twice: offload pass (1) first, then host pass (0)
+  const int mode = intel->precision();
+  if (mode == FixIntel::PREC_MODE_DOUBLE) {
+    if (!noghost) {
+      hbnti<double,double,0>(1, list, intel->get_double_buffers(),
+                             0, cop_end, intel);
+      hbnti<double,double,0>(0, list, intel->get_double_buffers(),
+                             host_begin, nowned, intel);
+    } else {
+      hbnti<double,double,1>(1, list, intel->get_double_buffers(),
+                             0, cop_end, intel);
+      hbnti<double,double,1>(0, list, intel->get_double_buffers(),
+                             host_begin, nowned, intel, cop_end);
+    }
+  } else if (mode == FixIntel::PREC_MODE_MIXED) {
+    if (!noghost) {
+      hbnti<float,double,0>(1, list, intel->get_mixed_buffers(),
+                            0, cop_end, intel);
+      hbnti<float,double,0>(0, list, intel->get_mixed_buffers(),
+                            host_begin, nowned, intel);
+    } else {
+      hbnti<float,double,1>(1, list, intel->get_mixed_buffers(),
+                            0, cop_end, intel);
+      hbnti<float,double,1>(0, list, intel->get_mixed_buffers(),
+                            host_begin, nowned, intel, cop_end);
+    }
+  } else {
+    if (!noghost) {
+      hbnti<float,float,0>(1, list, intel->get_single_buffers(),
+                           0, cop_end, intel);
+      hbnti<float,float,0>(0, list, intel->get_single_buffers(),
+                           host_begin, nowned, intel);
+    } else {
+      hbnti<float,float,1>(1, list, intel->get_single_buffers(),
+                           0, cop_end, intel);
+      hbnti<float,float,1>(0, list, intel->get_single_buffers(),
+                           host_begin, nowned, intel, cop_end);
+    }
+  }
+}
+
+template <class flt_t, class acc_t, int offload_noghost>  // worker for half_bin_newton_tri_intel(): builds lists for atoms [astart,aend)
+void Neighbor::hbnti(const int offload, NeighList *list, void *buffers_in,  // offload != 0 selects the offload pass
+                     const int astart, const int aend, void *fix_in,  // buffers_in / fix_in are type-erased pointers, cast below
+		     const int offload_end) {  // offload_end is tested in the pair loop when offload_noghost is set
+  IntelBuffers<flt_t,acc_t> *buffers = (IntelBuffers<flt_t,acc_t> *)buffers_in;  // caller must pass buffers matching <flt_t,acc_t>
+  FixIntel *fix = (FixIntel *)fix_in;
+  const int nall = atom->nlocal + atom->nghost;
+  int pad = 1;  // each atom's neighbor count is padded to a multiple of this
+
+  if (offload) {  // the offload pass also grows the buffers, packs the atoms and bins them
+    fix->start_watch(TIME_PACK);
+    buffers->grow(nall, atom->nlocal, comm->nthreads, aend);
+    buffers->grow_nbor(list, atom->nlocal, aend);
+
+    ATOM_T biga;
+    biga.x = INTEL_BIGP;
+    biga.y = INTEL_BIGP;
+    biga.z = INTEL_BIGP;
+    biga.w = 1;
+    buffers->get_x()[nall]=biga;  // dummy atom stored one slot past the last real atom
+
+    const int nthreads = comm->nthreads;
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(buffers)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, 
+				sizeof(ATOM_T));
+      buffers->thr_pack(ifrom, ito, 0);  // each thread handles its own [ifrom,ito) slice of the nall atoms
+    }
+    fix->stop_watch(TIME_PACK);
+
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    bin_atoms<flt_t,acc_t>(buffers->get_x());
+    if (INTEL_MIC_NBOR_PAD > 1)
+      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  } else {
+    fix->start_watch(TIME_HOST_NEIGHBOR);
+    if (INTEL_NBOR_PAD > 1)
+      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  }
+  const int pad_width = pad;
+
+  if (aend-astart == 0) {  // empty atom range: nothing to build
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    return;
+  }
+
+  const ATOM_T * restrict const x = buffers->get_x();
+  int * restrict const firstneigh = buffers->firstneigh(list);
+  int nall_t = nall;
+  if (offload_noghost && offload) nall_t = atom->nlocal;  // ghosts are skipped in this case, so the padding index is nlocal
+  const int e_nall = nall_t;
+
+  const int molecular = atom->molecular;
+  int *ns = NULL, *s = NULL;
+  int tag_size, special_size;
+  if (molecular) {
+    s = atom->special[0];
+    ns = atom->nspecial[0];
+    tag_size = e_nall;
+    special_size = aend;
+  } else {
+    s = &buffers->_special_holder;  // non-molecular: placeholder pointers, zero transfer lengths
+    ns = &buffers->_nspecial_holder;
+    tag_size = 0;
+    special_size = 0;
+  }
+  const int * restrict const special = s;
+  const int * restrict const nspecial = ns;
+  const int maxspecial = atom->maxspecial;
+  const int * restrict const tag = atom->tag;
+
+  int * restrict const ilist = list->ilist;
+  int * restrict numneigh = list->numneigh;
+  int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int nstencil = list->nstencil;
+  const int * restrict const stencil = list->stencil;
+  const flt_t * restrict const cutneighsq = buffers->get_cutneighsq()[0];
+  const int ntypes = atom->ntypes + 1;
+  const int nlocal = atom->nlocal;
+
+  #ifndef _LMP_INTEL_OFFLOAD
+  int * const mask = atom->mask;  // only read by the exclusion() test, which exists in non-offload builds only
+  int * const molecule = atom->molecule;
+  #endif
+
+  int tnum;
+  int *overflow;
+  double *timer_compute;
+  if (offload) {  // offload pass: use the fix's offload timer, thread count and overflow flags
+    timer_compute = fix->off_watch_neighbor();
+    tnum = buffers->get_off_threads();
+    overflow = fix->get_off_overflow_flag();
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    fix->start_watch(TIME_OFFLOAD_LATENCY);
+  } else {
+    tnum = comm->nthreads;
+    overflow = fix->get_overflow_flag();
+  }
+  const int nthreads = tnum;
+  const int maxnbors = buffers->get_max_nbors();
+
+  const flt_t bboxlo0 = this->bboxlo[0];
+  const flt_t bboxlo1 = this->bboxlo[1];
+  const flt_t bboxlo2 = this->bboxlo[2];
+  const flt_t bboxhi0 = this->bboxhi[0];
+  const flt_t bboxhi1 = this->bboxhi[1];
+  const flt_t bboxhi2 = this->bboxhi[2];
+  const flt_t bininvx = this->bininvx;
+  const flt_t bininvy = this->bininvy;
+  const flt_t bininvz = this->bininvz;
+  // Dummy coordinates (INTEL_BIGP) stand in for padding entries; abort if they could be within the cutoff
+  {
+    const flt_t dx = (INTEL_BIGP - bboxhi0);
+    const flt_t dy = (INTEL_BIGP - bboxhi1);
+    const flt_t dz = (INTEL_BIGP - bboxhi2);
+    if (dx * dx + dy * dy + dz * dz < static_cast<flt_t>(cutneighmaxsq))
+      error->one(FLERR,
+	"Intel package expects no atoms within cutoff of {1e15,1e15,1e15}.");
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  const int * restrict const binhead = this->binhead;
+  const int * restrict const special_flag = this->special_flag;
+  const int nbinx = this->nbinx;
+  const int nbiny = this->nbiny;
+  const int nbinz = this->nbinz;
+  const int mbinxlo = this->mbinxlo;
+  const int mbinylo = this->mbinylo;
+  const int mbinzlo = this->mbinzlo;
+  const int mbinx = this->mbinx;
+  const int mbiny = this->mbiny;
+  const int mbinz = this->mbinz;
+  const int * restrict const bins = this->bins;
+  const int cop = fix->coprocessor_number();
+  const int separate_buffers = fix->separate_buffers();
+  #pragma offload target(mic:cop) if(offload) \
+    in(x:length(e_nall+1) alloc_if(0) free_if(0)) \
+    in(tag:length(tag_size) alloc_if(0) free_if(0)) \
+    in(special:length(special_size*maxspecial) alloc_if(0) free_if(0)) \
+    in(nspecial:length(special_size*3) alloc_if(0) free_if(0)) \
+    in(bins:length(nall) alloc_if(0) free_if(0)) \
+    in(binhead:length(mbins) alloc_if(0) free_if(0)) \
+    in(cutneighsq:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    out(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(ilist:length(0) alloc_if(0) free_if(0)) \
+    in(stencil:length(nstencil) alloc_if(0) free_if(0)) \
+    in(special_flag:length(0) alloc_if(0) free_if(0)) \
+    in(maxnbors,nthreads,maxspecial,nstencil,nbinx,nbiny,nbinz,offload_end) \
+    in(mbinxlo,mbinylo,mbinzlo,mbinx,mbiny,mbinz,pad_width,e_nall,offload) \
+    in(bininvx,bininvy,bininvz,bboxlo0,bboxlo1,bboxlo2,separate_buffers) \
+    in(bboxhi0, bboxhi1, bboxhi2, astart, aend, nlocal, molecular, ntypes) \
+    out(overflow:length(5) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(numneigh)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    overflow[LMP_LOCAL_MIN] = astart;  // seed the min/max index trackers; per-thread values are merged after the build
+    overflow[LMP_LOCAL_MAX] = aend - 1;
+    overflow[LMP_GHOST_MIN] = e_nall;
+    overflow[LMP_GHOST_MAX] = -1;
+    #endif
+
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(numneigh, overflow)
+    #endif
+    {
+      #ifdef _LMP_INTEL_OFFLOAD
+      int lmin = e_nall, lmax = -1, gmin = e_nall, gmax = -1;  // per-thread min/max of local and ghost neighbor indices
+      #endif
+
+      const int num = aend-astart;
+      int tid, ifrom, ito;
+      IP_PRE_omp_range_id(ifrom,ito,tid,num,nthreads);  // presumably yields this thread's slice of num and its id
+      ifrom += astart;
+      ito += astart;
+
+      int which;
+
+      const int list_size = (ito + tid + 1) * maxnbors;  // upper bound used by the overflow check below
+      int ct = (ifrom + tid) * maxnbors;  // offset of the current atom's list within firstneigh
+      int *neighptr = firstneigh + ct;
+      for (int i = ifrom; i < ito; i++) {
+        int j, k, n, n2, itype, jtype, ibin;
+        double xtmp, ytmp, ztmp, delx, dely, delz, rsq;
+
+        n = 0;  // local neighbors fill the list from the front
+        n2 = maxnbors;  // ghost neighbors are staged from slot maxnbors onward
+
+        xtmp = x[i].x;
+        ytmp = x[i].y;
+        ztmp = x[i].z;
+        itype = x[i].w;
+        const int ioffset = ntypes * itype;
+
+        // loop over all atoms in bins in stencil
+        // pairs for atoms j "below" i are excluded
+        // below = lower z or (equal z and lower y) or (equal zy and lower x)
+        //         or (equal zyx and j <= i)
+        // latter excludes self-self interaction but allows superposed atoms
+
+        ibin = mcoord2bin(x[i].x, x[i].y, x[i].z, bboxlo0, bboxlo1, bboxlo2,
+                          bboxhi0, bboxhi1, bboxhi2, bininvx, bininvy, bininvz,
+                          nbinx, nbiny, nbinz, mbinx, mbiny, mbinz,
+                          mbinxlo, mbinylo, mbinzlo);
+
+        for (k = 0; k < nstencil; k++) {
+          for (j = binhead[ibin + stencil[k]]; j >= 0; j = bins[j]) {
+	    if (offload_noghost) {
+              if (j < nlocal) {
+                if (i < offload_end) continue;  // local j is skipped when i lies below offload_end
+              } else if (offload) continue;  // ghost j is skipped on the offload pass
+            }
+
+            if (x[j].z < ztmp) continue;
+            if (x[j].z == ztmp) {
+              if (x[j].y < ytmp) continue;
+              if (x[j].y == ytmp) {
+                if (x[j].x < xtmp) continue;
+                if (x[j].x == xtmp && j <= i) continue;
+              }
+            }
+
+            jtype = x[j].w;
+            #ifndef _LMP_INTEL_OFFLOAD
+            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+	    #endif
+
+            delx = xtmp - x[j].x;
+            dely = ytmp - x[j].y;
+            delz = ztmp - x[j].z;
+            rsq = delx * delx + dely * dely + delz * delz;
+            if (rsq <= cutneighsq[ioffset + jtype]) {
+              if (j < nlocal) {
+                neighptr[n++] = j;
+                #ifdef _LMP_INTEL_OFFLOAD
+		if (j < lmin) lmin = j;
+		if (j > lmax) lmax = j;
+                #endif
+	      }  else {
+                neighptr[n2++] = j;
+  	        #ifdef _LMP_INTEL_OFFLOAD
+		if (j < gmin) gmin = j;
+		if (j > gmax) gmax = j;
+                #endif
+	      }
+            }
+          }
+        }
+        ilist[i] = i;
+
+        cnumneigh[i] = ct;
+        if (n > maxnbors) *overflow = 1;  // local neighbors ran into the ghost staging area
+        for (k = maxnbors; k < n2; k++) neighptr[n++] = neighptr[k];  // append the staged ghosts after the locals
+        while( (n % pad_width) != 0 ) neighptr[n++] = e_nall;  // pad the list with the dummy index
+        numneigh[i] = n;
+        while((n % (INTEL_DATA_ALIGN / sizeof(int))) != 0) n++;  // round storage up to a multiple of INTEL_DATA_ALIGN bytes
+        ct += n;
+        if (ct + n + maxnbors > list_size) {  // next list may not fit: flag overflow and restart at the region start
+          *overflow = 1;
+          ct = (ifrom + tid) * maxnbors;
+        }
+        neighptr = firstneigh + ct;  // derived from ct so the write pointer also follows an overflow reset
+      }
+
+      if (*overflow == 1)
+        for (int i = ifrom; i < ito; i++)
+          numneigh[i] = 0;
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (separate_buffers) {
+        #if defined(_OPENMP)
+        #pragma omp critical
+        #endif
+        {
+          if (lmin < overflow[LMP_LOCAL_MIN]) overflow[LMP_LOCAL_MIN] = lmin;
+          if (lmax > overflow[LMP_LOCAL_MAX]) overflow[LMP_LOCAL_MAX] = lmax;
+          if (gmin < overflow[LMP_GHOST_MIN]) overflow[LMP_GHOST_MIN] = gmin;
+          if (gmax > overflow[LMP_GHOST_MAX]) overflow[LMP_GHOST_MAX] = gmax;
+        }
+        #pragma omp barrier
+      }
+
+      int ghost_offset = 0, nall_offset = e_nall;  // index shifts applied to ghost and dummy entries when buffers are separate
+      if (separate_buffers) {
+        int nghost = overflow[LMP_GHOST_MAX] + 1 - overflow[LMP_GHOST_MIN];
+        if (nghost < 0) nghost = 0;
+        if (offload) {
+          ghost_offset = overflow[LMP_GHOST_MIN] - overflow[LMP_LOCAL_MAX] - 1;
+          nall_offset = overflow[LMP_LOCAL_MAX] + 1 + nghost;
+	} else {
+          ghost_offset = overflow[LMP_GHOST_MIN] - nlocal;
+          nall_offset = nlocal + nghost;
+        }
+      }
+      #endif
+
+      if (molecular) {  // fold 'which' from ofind_special into the bits above SBBITS of each entry
+        for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+          for (int jj = 0; jj < jnum; jj++) {
+            const int j = jlist[jj];
+            ofind_special(which, special, nspecial, i, tag[j], special_flag);
+            #ifdef _LMP_INTEL_OFFLOAD
+	    if (j >= nlocal) {
+	      if (j == e_nall) 
+		jlist[jj] = nall_offset;
+	      else if (which > 0) 
+		jlist[jj] = (j-ghost_offset) ^ (which << SBBITS);
+	      else jlist[jj]-=ghost_offset;
+            } else
+            #endif
+	      if (which > 0) jlist[jj] = j ^ (which << SBBITS);
+          }
+        }
+      }
+      #ifdef _LMP_INTEL_OFFLOAD
+      else if (separate_buffers) {
+	for (int i = ifrom; i < ito; ++i) {
+          int * restrict jlist = firstneigh + cnumneigh[i];
+          const int jnum = numneigh[i];
+	  int jj = 0;
+	  for (jj = 0; jj < jnum; jj++)  // skip the leading local entries
+	    if (jlist[jj] >= nlocal) break;
+	  while (jj < jnum) {
+	    if (jlist[jj] == e_nall) jlist[jj] = nall_offset;
+	    else jlist[jj] -= ghost_offset;
+	    jj++;
+	  }
+	}
+      }
+      #endif
+    } // end omp
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  if (offload) {
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+    #ifdef _LMP_INTEL_OFFLOAD
+    for (int n = 0; n < aend; n++) {  // host copies for the offloaded range: identity ilist, zero counts
+      ilist[n] = n;
+      numneigh[n] = 0;
+    }
+    #endif
+  } else {
+    for (int i = astart; i < aend; i++)  // host pass: publish the per-atom list pointers
+      list->firstneigh[i] = firstneigh + cnumneigh[i];
+    fix->stop_watch(TIME_HOST_NEIGHBOR);
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (separate_buffers) {
+      fix->start_watch(TIME_PACK);
+      fix->set_neighbor_host_sizes();
+      buffers->pack_sep_from_single(fix->host_min_local(),
+				    fix->host_used_local(),
+				    fix->host_min_ghost(),
+				    fix->host_used_ghost());
+      fix->stop_watch(TIME_PACK);
+    }
+    #endif
+  }
+}
diff --git a/src/USER-INTEL/pair_gayberne_intel.cpp b/src/USER-INTEL/pair_gayberne_intel.cpp
new file mode 100644
index 0000000000..46e608c92f
--- /dev/null
+++ b/src/USER-INTEL/pair_gayberne_intel.cpp
@@ -0,0 +1,1075 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_gayberne_intel.h"
+#include "math_extra_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "atom_vec_ellipsoid.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1  // these shorthands need a template parameter named flt_t in scope
+#define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
+#define FC_PACKED3_T typename ForceConst<flt_t>::fc_packed3
+
+/* ---------------------------------------------------------------------- */
+
+PairGayBerneIntel::PairGayBerneIntel(LAMMPS *lmp) : PairGayBerne(lmp)
+{
+  // clear respa_enable and add the INTEL bit to this style's suffix flags
+  respa_enable = 0;
+  suffix_flag = suffix_flag | Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGayBerneIntel::compute(int eflag, int vflag)
+{
+  // choose the <flt_t,acc_t> instantiation from the fix's precision mode
+  const int prec_mode = fix->precision();
+  if (prec_mode == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else if (prec_mode == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                            force_const_single);
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+template <class flt_t, class acc_t>  // precision-specific driver: pack atom data, then run eval<> on both ranges
+void PairGayBerneIntel::compute(int eflag, int vflag,
+                                IntelBuffers<flt_t,acc_t> *buffers,
+                                const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag, vflag);
+  } else evflag = vflag_fdotr = 0;  // no energy/virial requested: clear the tally flags
+
+  const int inum = list->inum;
+  const int nall = atom->nlocal + atom->nghost;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  if (fix->separate_buffers() == 0) {  // shared buffer: pack here (eval() packs in the separate-buffer case)
+    fix->start_watch(TIME_PACK);
+    const AtomVecEllipsoid::Bonus * const bonus = avec->bonus;
+    const int * const ellipsoid = atom->ellipsoid;
+    QUAT_T * restrict const quat = buffers->get_quat();
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, nall, nthreads, 
+				sizeof(ATOM_T));
+      if (ago != 0) buffers->thr_pack(ifrom,ito,ago);  // skipped when ago == 0; presumably packed by the neighbor build then - TODO confirm
+
+      for (int i = ifrom; i < ito; i++) {
+	int qi = ellipsoid[i];
+	if (qi > -1) {  // only atoms with a bonus entry have a quaternion to copy
+	  quat[i].w = bonus[qi].quat[0];
+	  quat[i].i = bonus[qi].quat[1];
+	  quat[i].j = bonus[qi].quat[2];
+	  quat[i].k = bonus[qi].quat[3];
+	}
+      }
+    }
+    quat[nall].w = (flt_t)1.0;  // identity quaternion for the extra entry at index nall
+    quat[nall].i = (flt_t)0.0;
+    quat[nall].j = (flt_t)0.0;
+    quat[nall].k = (flt_t)0.0;
+    fix->stop_watch(TIME_PACK);
+  }
+
+  if (evflag || vflag_fdotr) {
+    int ovflag = 0;  // virial mode handed to eval: 2 = vflag_fdotr set, 1 = vflag set, 0 = neither
+    if (vflag_fdotr) ovflag = 2;
+    else if (vflag) ovflag = 1;
+    if (eflag) {
+      if (force->newton_pair) {
+        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);  // template args are <EVFLAG,EFLAG,NEWTON_PAIR>; offload range first
+        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);  // then the host range
+      } else {
+        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    } else {
+      if (force->newton_pair) {
+        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    }
+  } else {
+    if (force->newton_pair) {
+      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+    }
+  }
+}
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairGayBerneIntel::eval(const int offload, const int vflag,
+                             IntelBuffers<flt_t,acc_t> *buffers,
+                             const ForceConst<flt_t> &fc,
+                             const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return;
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  ATOM_T * restrict const x = buffers->get_x(offload);
+  QUAT_T * restrict const quat = buffers->get_quat(offload);
+  const AtomVecEllipsoid::Bonus *bonus = avec->bonus;
+  const int *ellipsoid = atom->ellipsoid;
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->separate_buffers()) {				
+    fix->start_watch(TIME_PACK);					
+    if (offload) {
+      #pragma omp parallel default(none) \
+	shared(buffers,nlocal,nall,bonus,ellipsoid)
+      {									
+        int ifrom, ito, tid;						
+	int nthreads = comm->nthreads;					
+	IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,		
+				  nthreads, sizeof(ATOM_T));		
+	if (ago != 0) buffers->thr_pack_cop(ifrom, ito, 0);
+	for (int i = ifrom; i < ito; i++) {
+	  int qi = ellipsoid[i];
+	  if (qi > -1) {
+	    quat[i].w = bonus[qi].quat[0];
+	    quat[i].i = bonus[qi].quat[1];
+	    quat[i].j = bonus[qi].quat[2];
+	    quat[i].k = bonus[qi].quat[3];
+	  }
+	}
+	int nghost = nall - nlocal;
+	if (nghost) {
+	  IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,		
+				 nthreads, sizeof(ATOM_T));			
+	  int offset = 0;
+	  ifrom += nlocal;
+	  ito += nlocal;
+	  if (ago != 0) {
+	    offset = fix->offload_min_ghost() - nlocal;
+	    buffers->thr_pack_cop(ifrom, ito, offset, ago == 1);
+	  }
+	  for (int i = ifrom; i < ito; i++) {
+	    int qi = ellipsoid[i + offset];
+	    if (qi > -1) {
+	      quat[i].w = bonus[qi].quat[0];
+	      quat[i].i = bonus[qi].quat[1];
+	      quat[i].j = bonus[qi].quat[2];
+	      quat[i].k = bonus[qi].quat[3];
+	    }
+	  }
+	}
+      }									
+    } else {
+      if (ago != 0) buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);
+      for (int i = fix->host_min_local(); i < nlocal; i++) {
+	int qi = ellipsoid[i];
+	if (qi > -1) {
+	  quat[i].w = bonus[qi].quat[0];
+	  quat[i].i = bonus[qi].quat[1];
+	  quat[i].j = bonus[qi].quat[2];
+	  quat[i].k = bonus[qi].quat[3];
+	}
+      }
+      int offset = fix->host_min_ghost() - nlocal;
+      if (ago != 0) buffers->thr_pack_host(nlocal, nall, offset);
+      for (int i = nlocal; i < nall; i++) {
+	int qi = ellipsoid[i + offset];
+	if (qi > -1) {
+	  quat[i].w = bonus[qi].quat[0];
+	  quat[i].i = bonus[qi].quat[1];
+	  quat[i].j = bonus[qi].quat[2];
+	  quat[i].k = bonus[qi].quat[3];
+	}
+      }
+    }									
+    fix->stop_watch(TIME_PACK);						
+  }									
+  #endif
+
+  //  const int * restrict const ilist = list->ilist;
+  const int * restrict const numneigh = list->numneigh;
+  const int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int * restrict const firstneigh = buffers->firstneigh(list);
+  const flt_t * restrict const special_lj = fc.special_lj;
+
+  const FC_PACKED1_T * restrict const ijc = fc.ijc[0];
+  const FC_PACKED2_T * restrict const lj34 = fc.lj34[0];
+  const FC_PACKED3_T * restrict const ic = fc.ic;
+  const flt_t mu = fc.mu;
+  const flt_t gamma = fc.gamma;
+  const flt_t upsilon = fc.upsilon;
+
+  flt_t * const rsq_formi = fc.rsq_form[0];
+  flt_t * const delx_formi = fc.delx_form[0];
+  flt_t * const dely_formi = fc.dely_form[0];
+  flt_t * const delz_formi = fc.delz_form[0];
+  int * const jtype_formi = fc.jtype_form[0];
+  int * const jlist_formi = fc.jlist_form[0];
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+		       buffers, offload, fix, separate_flag,
+		       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * restrict f_start;
+  acc_t * restrict ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+  const int max_nbors = _max_nbors;
+  const int nthreads = tc;
+
+  int pad = 1;
+  if (offload) {
+    if (INTEL_MIC_NBOR_PAD > 1)
+      pad = INTEL_MIC_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  } else {
+    if (INTEL_NBOR_PAD > 1)
+      pad = INTEL_NBOR_PAD * sizeof(float) / sizeof(flt_t);
+  }    
+  const int pad_width = pad;
+  
+  #ifdef _LMP_INTEL_OFFLOAD
+  int *overflow = fix->get_off_overflow_flag();
+  double *timer_compute = fix->off_watch_pair();
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+  #pragma offload target(mic:_cop) if(offload) \
+    in(special_lj:length(0) alloc_if(0) free_if(0)) \
+    in(ijc,lj34,ic:length(0) alloc_if(0) free_if(0)) \
+    in(rsq_formi, delx_formi, dely_formi: length(0) alloc_if(0) free_if(0)) \
+    in(delz_formi, jtype_formi, jlist_formi: length(0) alloc_if(0) free_if(0))\
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(quat:length(nall+1) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(nthreads,inum,nall,ntypes,vflag,eatom,minlocal,separate_flag) \
+    in(astart,nlocal,f_stride,max_nbors,mu,gamma,upsilon,offload,pad_width) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    signal(f_start)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute=MIC_Wtime();
+    #endif
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (separate_flag) {							
+      if (separate_flag < 3) {							
+	int all_local = nlocal;						
+	int ghost_min = overflow[LMP_GHOST_MIN];				
+	nlocal = overflow[LMP_LOCAL_MAX] + 1;				
+	int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;
+	if (nghost < 0) nghost = 0;
+	nall = nlocal + nghost;
+	separate_flag--;							
+	int flength;							
+	if (NEWTON_PAIR) flength = nall;					
+	else flength = nlocal;						
+	IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),		
+			     separate_flag);				
+	if (nghost) {
+	  if (nlocal < all_local || ghost_min > all_local) {			
+	    memmove(x + nlocal, x + ghost_min,
+		    (nall - nlocal) * sizeof(ATOM_T));			
+	    memmove(quat + nlocal, quat + ghost_min,
+		    (nall - nlocal) * sizeof(QUAT_T));			
+	  }
+	}
+      }	
+      x[nall].x = (flt_t)INTEL_BIGP;
+      x[nall].y = (flt_t)INTEL_BIGP;
+      x[nall].z = (flt_t)INTEL_BIGP;
+      quat[nall].w = (flt_t)1.0;
+      quat[nall].i = (flt_t)0.0;
+      quat[nall].j = (flt_t)0.0;
+      quat[nall].k = (flt_t)0.0;
+    }				
+    #endif
+
+    acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EVFLAG) {
+      oevdwl = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(f_start,f_stride,nlocal,nall,minlocal) \
+      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+
+      FORCE_T * restrict const f = f_start - minlocal * 2 + (tid * f_stride);
+      memset(f + minlocal * 2, 0, f_stride * sizeof(FORCE_T));
+
+      flt_t * restrict const rsq_form = rsq_formi + tid * max_nbors;
+      flt_t * restrict const delx_form = delx_formi + tid * max_nbors;
+      flt_t * restrict const dely_form = dely_formi + tid * max_nbors;
+      flt_t * restrict const delz_form = delz_formi + tid * max_nbors;
+      int * restrict const jtype_form = jtype_formi + tid * max_nbors;
+      int * restrict const jlist_form = jlist_formi + tid * max_nbors;
+
+      int ierror = 0;
+      for (int i = iifrom; i < iito; ++i) {
+        // const int i = ilist[ii];
+        const int itype = x[i].w;
+        const int ptr_off = itype * ntypes;
+        const FC_PACKED1_T * restrict const ijci = ijc + ptr_off;
+        const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off;
+
+        const int * restrict const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+
+        flt_t a1_0, a1_1, a1_2, a1_3, a1_4, a1_5, a1_6, a1_7, a1_8;
+        flt_t b1_0, b1_1, b1_2, b1_3, b1_4, b1_5, b1_6, b1_7, b1_8;
+        flt_t g1_0, g1_1, g1_2, g1_3, g1_4, g1_5, g1_6, g1_7, g1_8;
+
+        if (ijci[itype].form == ELLIPSE_ELLIPSE) {
+          flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
+          ME_quat_to_mat_trans(quat[i],a1);
+          ME_diag_times3(ic[itype].well,a1,temp);
+          ME_transpose_times3(a1,temp,b1);
+          ME_diag_times3(ic[itype].shape2,a1,temp);
+          ME_transpose_times3(a1,temp,g1);
+        }
+
+        acc_t fxtmp, fytmp, fztmp, fwtmp, t1tmp, t2tmp, t3tmp;
+        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
+        fxtmp = fytmp = fztmp = t1tmp = t2tmp = t3tmp = (acc_t)0.0;
+
+        if (EVFLAG) {
+          if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+        }
+
+        bool multiple_forms = false;
+        int packed_j = 0;
+        for (int jj = 0; jj < jnum; jj++) {
+          int jm = jlist[jj];
+          int j = jm & NEIGHMASK;
+          const int jtype = x[j].w;
+
+          if (ijci[jtype].form == ELLIPSE_ELLIPSE) {
+            flt_t delx = x[j].x-xtmp;
+            flt_t dely = x[j].y-ytmp;
+            flt_t delz = x[j].z-ztmp;
+            flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+            if (rsq < ijci[jtype].cutsq) {
+              rsq_form[packed_j] = rsq;
+              delx_form[packed_j] = delx;
+              dely_form[packed_j] = dely;
+              delz_form[packed_j] = delz;
+              jtype_form[packed_j] = jtype;
+              jlist_form[packed_j] = jm;
+              packed_j++;
+            }
+          } else
+            multiple_forms = true;
+        }
+	while( (packed_j % pad_width) != 0 )
+	  jlist_form[packed_j++] = nall;
+
+        // -------------------------------------------------------------
+
+	#ifdef __MIC__
+	__assume(packed_j % INTEL_VECTOR_WIDTH == 0);
+	__assume(packed_j % 8 == 0);
+	__assume(packed_j % INTEL_MIC_VECTOR_WIDTH == 0);
+	#endif
+        #pragma vector aligned
+	#pragma simd reduction(+:fxtmp,fytmp,fztmp,fwtmp,t1tmp,t2tmp,t3tmp, \
+	                         sevdwl,sv0,sv1,sv2,sv3,sv4,sv5)
+        for (int jj = 0; jj < packed_j; jj++) {
+          flt_t a2_0, a2_1, a2_2, a2_3, a2_4, a2_5, a2_6, a2_7, a2_8;
+          flt_t b2_0, b2_1, b2_2, b2_3, b2_4, b2_5, b2_6, b2_7, b2_8;
+          flt_t g2_0, g2_1, g2_2, g2_3, g2_4, g2_5, g2_6, g2_7, g2_8;
+          flt_t temp_0,temp_1,temp_2,temp_3,temp_4,temp_5,temp_6,temp_7,temp_8;
+          flt_t fforce_0, fforce_1, fforce_2, ttor_0, ttor_1, ttor_2;
+          flt_t rtor_0, rtor_1, rtor_2;
+
+	  const int sbindex = jlist_form[jj] >> SBBITS & 3;
+	  const int j = jlist_form[jj] & NEIGHMASK;
+          flt_t factor_lj = special_lj[sbindex];
+          const int jtype = jtype_form[jj];
+	  const flt_t sigma = ijci[jtype].sigma;
+	  const flt_t epsilon = ijci[jtype].epsilon;
+	  const flt_t shape2_0 = ic[jtype].shape2[0];
+	  const flt_t shape2_1 = ic[jtype].shape2[1];
+	  const flt_t shape2_2 = ic[jtype].shape2[2];
+          flt_t one_eng, evdwl;
+
+          ME_quat_to_mat_trans(quat[j], a2);
+          ME_diag_times3(ic[jtype].well, a2, temp);
+          ME_transpose_times3(a2, temp, b2);
+          ME_diag_times3a(shape2, a2, temp);
+          ME_transpose_times3(a2, temp, g2);
+	  
+          flt_t tempv_0, tempv_1, tempv_2, tempv2_0, tempv2_1, tempv2_2;
+          flt_t temp1, temp2, temp3;
+
+          flt_t r12hat_0, r12hat_1, r12hat_2;
+          ME_normalize3(delx_form[jj], dely_form[jj], delz_form[jj], r12hat);
+          flt_t r = sqrt(rsq_form[jj]);
+
+          // compute distance of closest approach
+	  
+          flt_t g12_0, g12_1, g12_2, g12_3, g12_4, g12_5, g12_6, g12_7, g12_8;
+          ME_plus3(g1, g2, g12);
+          flt_t kappa_0, kappa_1, kappa_2;
+          ME_mldivide3(g12, delx_form[jj], dely_form[jj], delz_form[jj],
+		       kappa, ierror);
+	  
+          // tempv = G12^-1*r12hat
+
+          flt_t inv_r = (flt_t)1.0 / r;
+          tempv_0 = kappa_0 * inv_r;
+          tempv_1 = kappa_1 * inv_r;
+          tempv_2 = kappa_2 * inv_r;
+          flt_t sigma12 = ME_dot3(r12hat, tempv);
+          sigma12 = pow((flt_t)0.5 * sigma12,(flt_t) - 0.5);
+          flt_t h12 = r - sigma12;
+
+          // energy
+          // compute u_r
+
+          flt_t varrho = sigma / (h12 + gamma * sigma);
+          flt_t varrho6 = pow(varrho, (flt_t)6.0);
+          flt_t varrho12 = varrho6 * varrho6;
+          flt_t u_r = (flt_t)4.0 * epsilon * (varrho12 - varrho6);
+
+          // compute eta_12
+
+          flt_t eta = (flt_t)2.0 * ijci[jtype].lshape;
+          flt_t det_g12 = ME_det3(g12);
+          eta = pow(eta / det_g12, upsilon);
+
+          // compute chi_12
+
+          flt_t b12_0, b12_1, b12_2, b12_3, b12_4, b12_5, b12_6, b12_7, b12_8;
+          flt_t iota_0, iota_1, iota_2;
+          ME_plus3(b1, b2, b12);
+          ME_mldivide3(b12, delx_form[jj], dely_form[jj], delz_form[jj],
+		       iota, ierror);
+
+          // tempv = G12^-1*r12hat
+
+          tempv_0 = iota_0 * inv_r;
+          tempv_1 = iota_1 * inv_r;
+          tempv_2 = iota_2 * inv_r;
+          flt_t chi = ME_dot3(r12hat, tempv);
+          chi = pow(chi * (flt_t)2.0, mu);
+
+          // force
+          // compute dUr/dr
+
+          temp1 = ((flt_t)2.0 * varrho12 * varrho - varrho6 * varrho) / 
+	    sigma;
+          temp1 = temp1 * (flt_t)24.0 * epsilon;
+          flt_t u_slj = temp1 * pow(sigma12, (flt_t)3.0) * (flt_t)0.5;
+          flt_t dUr_0, dUr_1, dUr_2;
+          temp2 = ME_dot3(kappa, r12hat);
+          flt_t uslj_rsq = u_slj / rsq_form[jj];
+          dUr_0 = temp1 * r12hat_0 + uslj_rsq * (kappa_0 - temp2 * r12hat_0);
+          dUr_1 = temp1 * r12hat_1 + uslj_rsq * (kappa_1 - temp2 * r12hat_1);
+          dUr_2 = temp1 * r12hat_2 + uslj_rsq * (kappa_2 - temp2 * r12hat_2);
+
+          // compute dChi_12/dr
+
+          flt_t dchi_0, dchi_1, dchi_2;
+          temp1 = ME_dot3(iota, r12hat);
+          temp2 = (flt_t)-4.0 / rsq_form[jj] * mu * 
+	    pow(chi, (mu - (flt_t)1.0) / mu);
+          dchi_0 = temp2 * (iota_0 - temp1 * r12hat_0);
+          dchi_1 = temp2 * (iota_1 - temp1 * r12hat_1);
+          dchi_2 = temp2 * (iota_2 - temp1 * r12hat_2);
+
+          temp1 = -eta * u_r;
+          temp2 = eta * chi;
+          fforce_0 = temp1 * dchi_0 - temp2 * dUr_0;
+          fforce_1 = temp1 * dchi_1 - temp2 * dUr_1;
+          fforce_2 = temp1 * dchi_2 - temp2 * dUr_2;
+
+          // torque for particle 1 and 2
+          // compute dUr
+
+          tempv_0 = -uslj_rsq * kappa_0;
+          tempv_1 = -uslj_rsq * kappa_1;
+          tempv_2 = -uslj_rsq * kappa_2;
+          ME_vecmat(kappa, g1, tempv2);
+          ME_cross3(tempv, tempv2, dUr);
+          flt_t dUr2_0, dUr2_1, dUr2_2;
+
+          if (NEWTON_PAIR || j < nlocal) {
+            ME_vecmat(kappa, g2, tempv2);
+            ME_cross3(tempv, tempv2, dUr2);
+          }
+
+          // compute d_chi
+
+          ME_vecmat(iota, b1, tempv);
+          ME_cross3(tempv, iota, dchi);
+          temp1 = (flt_t)-4.0 / rsq_form[jj];
+          dchi_0 *= temp1;
+          dchi_1 *= temp1;
+          dchi_2 *= temp1;
+          flt_t dchi2_0, dchi2_1, dchi2_2;
+
+          if (NEWTON_PAIR || j < nlocal) {
+            ME_vecmat(iota, b2, tempv);
+            ME_cross3(tempv, iota, dchi2);
+            dchi2_0 *= temp1;
+            dchi2_1 *= temp1;
+            dchi2_2 *= temp1;
+          }
+
+          // compute d_eta
+
+          flt_t deta_0, deta_1, deta_2;
+          deta_0 = deta_1 = deta_2 = (flt_t)0.0;
+          ME_compute_eta_torque(g12, a1, shape2, temp);
+          temp1 = -eta * upsilon;
+
+          tempv_0 = temp1 * temp_0;
+          tempv_1 = temp1 * temp_1;
+          tempv_2 = temp1 * temp_2;
+          ME_mv0_cross3(a1, tempv, tempv2);
+          deta_0 += tempv2_0;
+          deta_1 += tempv2_1;
+          deta_2 += tempv2_2;
+
+          tempv_0 = temp1 * temp_3;
+          tempv_1 = temp1 * temp_4;
+          tempv_2 = temp1 * temp_5;
+          ME_mv1_cross3(a1, tempv, tempv2);
+          deta_0 += tempv2_0;
+          deta_1 += tempv2_1;
+          deta_2 += tempv2_2;
+
+          tempv_0 = temp1 * temp_6;
+          tempv_1 = temp1 * temp_7;
+          tempv_2 = temp1 * temp_8;
+          ME_mv2_cross3(a1, tempv, tempv2);
+          deta_0 += tempv2_0;
+          deta_1 += tempv2_1;
+          deta_2 += tempv2_2;
+
+          // compute d_eta for particle 2
+
+          flt_t deta2_0, deta2_1, deta2_2;
+          if (NEWTON_PAIR || j < nlocal) {
+            deta2_0 = deta2_1 = deta2_2 = (flt_t)0.0;
+            ME_compute_eta_torque(g12, a2, shape2, temp);
+
+            tempv_0 = temp1 * temp_0;
+            tempv_1 = temp1 * temp_1;
+            tempv_2 = temp1 * temp_2;
+            ME_mv0_cross3(a2, tempv, tempv2);
+            deta2_0 += tempv2_0;
+            deta2_1 += tempv2_1;
+            deta2_2 += tempv2_2;
+
+            tempv_0 = temp1 * temp_3;
+            tempv_1 = temp1 * temp_4;
+            tempv_2 = temp1 * temp_5;
+            ME_mv1_cross3(a2, tempv, tempv2);
+            deta2_0 += tempv2_0;
+            deta2_1 += tempv2_1;
+            deta2_2 += tempv2_2;
+
+            tempv_0 = temp1 * temp_6;
+            tempv_1 = temp1 * temp_7;
+            tempv_2 = temp1 * temp_8;
+            ME_mv2_cross3(a2, tempv, tempv2);
+            deta2_0 += tempv2_0;
+            deta2_1 += tempv2_1;
+            deta2_2 += tempv2_2;
+          }
+
+          // torque
+
+          temp1 = u_r * eta;
+          temp2 = u_r * chi;
+          temp3 = chi * eta;
+
+          ttor_0 = (temp1 * dchi_0 + temp2 * deta_0 + temp3 * dUr_0) * 
+	    (flt_t)-1.0;
+          ttor_1 = (temp1 * dchi_1 + temp2 * deta_1 + temp3 * dUr_1) * 
+	    (flt_t)-1.0;
+          ttor_2 = (temp1 * dchi_2 + temp2 * deta_2 + temp3 * dUr_2) * 
+	    (flt_t)-1.0;
+
+          if (NEWTON_PAIR || j < nlocal) {
+            rtor_0 = (temp1 * dchi2_0 + temp2 * deta2_0 + temp3 * dUr2_0) * 
+	      (flt_t)-1.0;
+            rtor_1 = (temp1 * dchi2_1 + temp2 * deta2_1 + temp3 * dUr2_1) * 
+	      (flt_t)-1.0;
+            rtor_2 = (temp1 * dchi2_2 + temp2 * deta2_2 + temp3 * dUr2_2) * 
+	      (flt_t)-1.0;
+          }
+
+          one_eng = temp1 * chi;
+	  #ifndef __MIC__
+	  if (jlist_form[jj] == nall) {
+	    one_eng = (flt_t)0.0;
+	    fforce_0 = 0.0;
+	    fforce_1 = 0.0;
+	    fforce_2 = 0.0;
+	    ttor_0 = 0.0;
+	    ttor_1 = 0.0;
+	    ttor_2 = 0.0;
+	    rtor_0 = 0.0;
+	    rtor_1 = 0.0;
+	    rtor_2 = 0.0;
+	  }
+	  #endif
+
+          fforce_0 *= factor_lj;
+          fforce_1 *= factor_lj;
+          fforce_2 *= factor_lj;
+          ttor_0 *= factor_lj;
+          ttor_1 *= factor_lj;
+          ttor_2 *= factor_lj;
+
+	  #ifdef __MIC__
+	  if (jlist_form[jj] < nall) {
+	  #endif
+	    fxtmp += fforce_0;
+	    fytmp += fforce_1;
+	    fztmp += fforce_2;
+	    t1tmp += ttor_0;
+	    t2tmp += ttor_1;
+	    t3tmp += ttor_2;
+
+	    if (NEWTON_PAIR || j < nlocal) {
+	      rtor_0 *= factor_lj;
+	      rtor_1 *= factor_lj;
+	      rtor_2 *= factor_lj;
+	      int jp = j * 2;
+	      f[jp].x -= fforce_0;
+	      f[jp].y -= fforce_1;
+	      f[jp].z -= fforce_2;
+	      jp++;
+	      f[jp].x += rtor_0;
+	      f[jp].y += rtor_1;
+	      f[jp].z += rtor_2;
+	    }
+	  
+	    if (EVFLAG) {
+	      flt_t ev_pre = (flt_t)0;
+	      if (NEWTON_PAIR || i < nlocal)
+		ev_pre += (flt_t)0.5;
+	      if (NEWTON_PAIR || j < nlocal)
+		ev_pre += (flt_t)0.5;
+
+	      if (EFLAG) {
+		evdwl = factor_lj * one_eng;
+		sevdwl += ev_pre * evdwl;
+		if (eatom) {
+		  if (NEWTON_PAIR || i < nlocal)
+		    fwtmp += (flt_t)0.5 * evdwl;
+		  if (NEWTON_PAIR || j < nlocal)
+		    f[j*2].w += (flt_t)0.5 * evdwl;
+		}
+	      }
+	      
+	      if (vflag == 1) {
+		ev_pre *= (flt_t)-1.0;
+		sv0 += ev_pre * delx_form[jj] * fforce_0;
+		sv1 += ev_pre * dely_form[jj] * fforce_1;
+		sv2 += ev_pre * delz_form[jj] * fforce_2;
+		sv3 += ev_pre * delx_form[jj] * fforce_1;
+		sv4 += ev_pre * delx_form[jj] * fforce_2;
+		sv5 += ev_pre * dely_form[jj] * fforce_2;
+	      }
+	    } // EVFLAG
+	  #ifdef __MIC__
+	  }
+	  #endif
+        } // for jj
+
+        // -------------------------------------------------------------
+
+        if (multiple_forms)
+          ierror = 2;
+
+        int ip = i * 2;
+        f[ip].x += fxtmp;
+        f[ip].y += fytmp;
+        f[ip].z += fztmp;
+        ip++;
+        f[ip].x += t1tmp;
+        f[ip].y += t2tmp;
+        f[ip].z += t3tmp;
+
+        if (EVFLAG) {
+          if (EFLAG) {
+            if (eatom) f[i * 2].w += fwtmp;
+            oevdwl += sevdwl;
+          }
+          if (vflag == 1) {
+            ov0 += sv0;
+            ov1 += sv1;
+            ov2 += sv2;
+            ov3 += sv3;
+            ov4 += sv4;
+            ov5 += sv5;
+          }
+        }
+      } // for i
+      int o_range;
+      if (NEWTON_PAIR)
+        o_range = nall;
+      else
+        o_range = nlocal;
+      if (offload == 0) o_range -= minlocal;
+      IP_PRE_omp_range_align(iifrom, iito, tid, o_range, nthreads, 
+			     sizeof(FORCE_T));
+      const int two_iito = iito * 2;
+
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+
+      acc_t *facc = &(f_start[0].x);
+      const int sto = two_iito * 4;
+      const int fst4 = f_stride * 4;
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      int t_off = f_stride;
+      if (EFLAG && eatom) {
+        for (int t = 1; t < nthreads; t++) {
+	  #pragma vector nontemporal
+          for (int n = iifrom * 2; n < two_iito; n++) {
+            f_start[n].x += f_start[n + t_off].x;
+            f_start[n].y += f_start[n + t_off].y;
+            f_start[n].z += f_start[n + t_off].z;
+            f_start[n].w += f_start[n + t_off].w;
+          }
+          t_off += f_stride;
+        }
+      } else {
+        for (int t = 1; t < nthreads; t++) {
+	  #pragma vector nontemporal
+          for (int n = iifrom * 2; n < two_iito; n++) {
+            f_start[n].x += f_start[n + t_off].x;
+            f_start[n].y += f_start[n + t_off].y;
+            f_start[n].z += f_start[n + t_off].z;
+          }
+          t_off += f_stride;
+        }
+      }
+
+      if (EVFLAG) {
+        if (vflag==2) {
+          const ATOM_T * restrict const xo = x + minlocal;
+	  #pragma vector nontemporal
+          for (int n = iifrom; n < iito; n++) {
+            const int nt2 = n * 2;
+            ov0 += f_start[nt2].x * xo[n].x;
+            ov1 += f_start[nt2].y * xo[n].y;
+            ov2 += f_start[nt2].z * xo[n].z;
+            ov3 += f_start[nt2].y * xo[n].x;
+            ov4 += f_start[nt2].z * xo[n].x;
+            ov5 += f_start[nt2].z * xo[n].y;
+          }
+        }
+      }
+
+      if (ierror)
+        f_start[1].w = ierror;
+    } // omp
+
+    if (EVFLAG) {
+      if (EFLAG) {
+        ev_global[0] = oevdwl;
+        ev_global[1] = (acc_t)0.0;
+      }
+      if (vflag) {
+        ev_global[2] = ov0;
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // offload
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload,eatom);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairGayBerneIntel::init_style()
+{
+  // Run the base-class setup first, then flag the last neighbor request
+  // in the list (index nrequest-1) as an intel request.
+  PairGayBerne::init_style();
+  neighbor->requests[neighbor->nrequest-1]->intel = 1;
+
+  // The fix with id "package_intel" must already exist; abort otherwise.
+  const int fix_index = modify->find_fix("package_intel");
+  if (fix_index < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[fix_index]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  fix->set_offload_affinity();
+  if (force->newton_pair) fix->set_offload_noghost(1);
+  _cop = fix->coprocessor_number();
+  #endif
+
+  // Release the neighbor buffers of the active precision mode and repack
+  // the force constants into the matching ForceConst instance.  Mixed and
+  // single precision both use the single-precision constants.
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    fix->get_mixed_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  }
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    fix->get_double_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  }
+  else {
+    fix->get_single_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_single_buffers());
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void PairGayBerneIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                         IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // Copy the pair coefficients of this style into the packed, precision
+  // specific tables of 'fc' (and, when offload is compiled in and a
+  // coprocessor is selected, push them to the device).
+  //   fc      - packed constants to (re)size and fill
+  //   buffers - intel buffers providing max neighbor count, offload thread
+  //             count and the cutneighsq table that is filled here
+
+  int tp1 = atom->ntypes + 1;
+  _max_nbors = buffers->get_max_nbors();
+
+  // scratch arrays are sized for the larger of host and offload threads
+  int mthreads = comm->nthreads;
+  if (mthreads < buffers->get_off_threads())
+    mthreads = buffers->get_off_threads();
+  fc.set_ntypes(tp1, _max_nbors, mthreads, memory, _cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  double cut, cutneigh;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i,j);
+        cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+      }
+    }
+  }
+
+  // slot 0 is always unity; slots 1-3 take the weights stored in force
+  fc.special_lj[0] = 1.0;
+  for (int i = 1; i < 4; i++)
+    fc.special_lj[i] = force->special_lj[i];
+
+  fc.gamma = gamma;
+  fc.upsilon = upsilon;
+  fc.mu = mu;
+
+  for (int i = 0; i < tp1; i++) {
+    for (int j = 0; j < tp1; j++) {
+      fc.ijc[i][j].lj1 = lj1[i][j];
+      fc.ijc[i][j].lj2 = lj2[i][j];
+      fc.ijc[i][j].cutsq = cutsq[i][j];
+      fc.ijc[i][j].offset = offset[i][j];
+      fc.ijc[i][j].sigma = sigma[i][j];
+      fc.ijc[i][j].epsilon = epsilon[i][j];
+      fc.ijc[i][j].form = form[i][j];
+      fc.ijc[i][j].lshape = lshape[i] * lshape[j];
+      fc.lj34[i][j].lj3 = lj3[i][j];
+      fc.lj34[i][j].lj4 = lj4[i][j];
+    }
+
+    // The packed per-type struct has 4 slots per vector, but the shape2
+    // and well tables of PairGayBerne hold only 3 components per type
+    // (NOTE(review): per PairGayBerne::allocate - confirm).  Copy the 3
+    // real components and zero the padding slot rather than reading one
+    // element past the end of the source row.
+    for (int j = 0; j < 3; j++) {
+      fc.ic[i].shape2[j] = shape2[i][j];
+      fc.ic[i].well[j] = well[i][j];
+    }
+    fc.ic[i].shape2[3] = (flt_t)0.0;
+    fc.ic[i].well[3] = (flt_t)0.0;
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  flt_t * special_lj = fc.special_lj;
+  FC_PACKED1_T *oijc = fc.ijc[0];
+  FC_PACKED2_T *olj34 = fc.lj34[0];
+  FC_PACKED3_T *oic = fc.ic;
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  if (oijc != NULL && oic != NULL) {
+    #pragma offload_transfer target(mic:_cop) \
+      in(special_lj: length(4) alloc_if(0) free_if(0)) \
+      in(oijc,olj34: length(tp1sq) alloc_if(0) free_if(0)) \
+      in(oic: length(tp1) alloc_if(0) free_if(0)) \
+      in(ocutneighsq: length(tp1sq))
+  }
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void PairGayBerneIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
+                                                      const int one_length,
+                                                      const int nthreads,
+                                                      Memory *memory,
+                                                      const int cop) {
+  // (Re)size the packed constant tables and the per-thread scratch arrays.
+  //   ntypes     - leading dimension of the type tables; 0 frees everything
+  //   one_length - number of scratch entries per thread
+  //   nthreads   - number of per-thread scratch rows
+  //   memory     - allocator used for the new arrays (remembered for free)
+  //   cop        - coprocessor id; device copies are handled only if >= 0
+  // Nothing is reallocated unless ntypes differs from the stored count.
+  const bool resize = (ntypes != _ntypes);
+
+  // ---- release the arrays of the previous size ----
+  if (resize && _ntypes > 0) {
+    fc_packed3 *oic = ic;
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    flt_t * ospecial_lj = special_lj;
+    fc_packed1 *oijc = ijc[0];
+    fc_packed2 *olj34 = lj34[0];
+    flt_t * orsq_form = rsq_form[0];
+    flt_t * odelx_form = delx_form[0];
+    flt_t * odely_form = dely_form[0];
+    flt_t * odelz_form = delz_form[0];
+    int * ojtype_form = jtype_form[0];
+    int * ojlist_form = jlist_form[0];
+
+    // free the device-side copies (free_if(1)) before the host arrays go
+    if (_cop >= 0 &&
+        ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
+        orsq_form != NULL && odelx_form != NULL && odely_form != NULL &&
+        odelz_form != NULL && ojtype_form != NULL && ojlist_form != NULL) {
+      #pragma offload_transfer target(mic:_cop) \
+        nocopy(ospecial_lj, oijc, olj34, oic: alloc_if(0) free_if(1)) \
+        nocopy(orsq_form, odelx_form, odely_form: alloc_if(0) free_if(1)) \
+        nocopy(odelz_form, ojtype_form, ojlist_form: alloc_if(0) free_if(1))
+    }
+    #endif
+
+    _memory->destroy(oic);
+    _memory->destroy(ijc);
+    _memory->destroy(lj34);
+    _memory->destroy(rsq_form);
+    _memory->destroy(delx_form);
+    _memory->destroy(dely_form);
+    _memory->destroy(delz_form);
+    _memory->destroy(jtype_form);
+    _memory->destroy(jlist_form);
+  }
+
+  // ---- allocate and initialize the arrays for the new size ----
+  if (resize && ntypes > 0) {
+    _cop = cop;
+    memory->create(ijc, ntypes, ntypes, "fc.ijc");
+    memory->create(lj34, ntypes, ntypes, "fc.lj34");
+    memory->create(ic, ntypes, "fc.ic");
+    memory->create(rsq_form, nthreads, one_length, "rsq_form");
+    memory->create(delx_form, nthreads, one_length, "delx_form");
+    memory->create(dely_form, nthreads, one_length, "dely_form");
+    memory->create(delz_form, nthreads, one_length, "delz_form");
+    memory->create(jtype_form, nthreads, one_length, "jtype_form");
+    memory->create(jlist_form, nthreads, one_length, "jlist_form");
+
+    // give every scratch entry a defined starting value
+    for (int row = 0; row < nthreads; ++row) {
+      for (int col = 0; col < one_length; ++col) {
+        rsq_form[row][col] = 10.0;
+        delx_form[row][col] = 10.0;
+        dely_form[row][col] = 10.0;
+        delz_form[row][col] = 10.0;
+        jtype_form[row][col] = 1;
+        jlist_form[row][col] = 0;
+      }
+    }
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    flt_t * ospecial_lj = special_lj;
+    fc_packed1 *oijc = ijc[0];
+    fc_packed2 *olj34 = lj34[0];
+    fc_packed3 *oic = ic;
+    flt_t * orsq_form = rsq_form[0];
+    flt_t * odelx_form = delx_form[0];
+    flt_t * odely_form = dely_form[0];
+    flt_t * odelz_form = delz_form[0];
+    int * ojtype_form = jtype_form[0];
+    int * ojlist_form = jlist_form[0];
+    int off_onel = one_length * nthreads;
+
+    int tp1sq = ntypes*ntypes;
+    // constant tables are only allocated on the device here (nocopy);
+    // the scratch arrays are allocated and their initial values copied in
+    if (cop >= 0 &&
+        ospecial_lj != NULL && oijc != NULL && olj34 != NULL &&
+        oic != NULL && orsq_form != NULL && odelx_form != NULL &&
+        odely_form != NULL && odelz_form != NULL && ojtype_form != NULL &&
+        ojlist_form != NULL) {
+      #pragma offload_transfer target(mic:cop) \
+        nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
+        nocopy(oijc,olj34: length(tp1sq) alloc_if(1) free_if(0)) \
+        nocopy(oic: length(ntypes) alloc_if(1) free_if(0)) \
+        in(orsq_form: length(off_onel) alloc_if(1) free_if(0)) \
+        in(odelx_form: length(off_onel) alloc_if(1) free_if(0)) \
+        in(odely_form: length(off_onel) alloc_if(1) free_if(0)) \
+        in(odelz_form: length(off_onel) alloc_if(1) free_if(0)) \
+        in(ojtype_form: length(off_onel) alloc_if(1) free_if(0)) \
+        in(ojlist_form: length(off_onel) alloc_if(1) free_if(0))
+    }
+    #endif
+  }
+
+  // remember the current size and allocator for the next call
+  _ntypes = ntypes;
+  _memory = memory;
+}
diff --git a/src/USER-INTEL/pair_gayberne_intel.h b/src/USER-INTEL/pair_gayberne_intel.h
new file mode 100644
index 0000000000..eb055e151e
--- /dev/null
+++ b/src/USER-INTEL/pair_gayberne_intel.h
@@ -0,0 +1,99 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(gayberne/intel,PairGayBerneIntel)
+
+#else
+
+#ifndef LMP_PAIR_GAYBERNE_INTEL_H
+#define LMP_PAIR_GAYBERNE_INTEL_H
+
+#include "pair_gayberne.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+// Gay-Berne pair style variant registered as "gayberne/intel".  It derives
+// from PairGayBerne and dispatches the force computation on the precision
+// mode reported by the intel fix.
+class PairGayBerneIntel : public PairGayBerne {
+
+ public:
+  PairGayBerneIntel(class LAMMPS *);
+
+  virtual void compute(int, int);
+  void init_style();
+
+ private:
+  template <class flt_t> class ForceConst;
+
+  // precision-specific driver called by the public compute(int,int)
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  // force kernel, specialized at compile time on the energy/virial and
+  // newton flags; processes atoms in [astart,aend)
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  // copy the pair coefficients into the packed tables of fc
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // Packed copies of the pair coefficients in precision flt_t, plus
+  // per-thread scratch arrays; storage is managed through set_ntypes().
+  template <class flt_t>
+  class ForceConst {
+   public:
+    // per type-pair constants
+    typedef struct {
+      flt_t cutsq, lj1, lj2, offset, sigma, epsilon, lshape;
+      int form;
+    } fc_packed1;
+    typedef struct { flt_t lj3, lj4; } fc_packed2;
+    // per type constants
+    typedef struct { flt_t shape2[4], well[4]; } fc_packed3;
+
+    __declspec(align(64)) flt_t special_lj[4], gamma, upsilon, mu;
+    fc_packed1 **ijc;
+    fc_packed2 **lj34;
+    fc_packed3 *ic;
+
+    // scratch arrays, allocated as nthreads x one_length by set_ntypes()
+    flt_t **rsq_form, **delx_form, **dely_form, **delz_form;
+    int **jtype_form, **jlist_form;
+
+    // _cop and _memory are given defined values as well: the destructor
+    // reads _cop even if set_ntypes() was never called with ntypes > 0,
+    // and reading an uninitialized int there would be undefined behavior.
+    ForceConst() : _ntypes(0), _cop(-1), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
+
+    void set_ntypes(const int ntypes, const int one_length,
+                    const int nthreads, Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;    // current table size, coprocessor id (<0: none)
+    Memory *_memory;      // allocator passed to the last set_ntypes() call
+  };
+
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+  int _max_nbors;
+
+  double gayberne_lj(const int i, const int j, double a1[3][3],
+                     double b1[3][3], double g1[3][3], double *r12,
+                     const double rsq, double *fforce, double *ttor);
+
+  FixIntel *fix;
+  int _cop;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
new file mode 100644
index 0000000000..576d5b21c7
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.cpp
@@ -0,0 +1,675 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_charmm_coul_long_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "kspace.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define LJ_T typename IntelBuffers<flt_t,flt_t>::vec4_t
+#define TABLE_T typename ForceConst<flt_t>::table_t
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulLongIntel::PairLJCharmmCoulLongIntel(LAMMPS *lmp) :
+  PairLJCharmmCoulLong(lmp), fix(NULL), _cop(-1)
+{ // _cop is assigned in init_style() only under _LMP_INTEL_OFFLOAD but is
+  suffix_flag |= Suffix::INTEL;  // always read by pack_force_const(): keep it defined
+  respa_enable = 0;              // rRESPA flag cleared for this style
+  cut_respa = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCharmmCoulLongIntel::~PairLJCharmmCoulLongIntel()
+{ // Empty: the ForceConst members release their arrays in ~ForceConst().
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag)
+{ // Forward to the templated compute() matching the fix's precision mode.
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    compute<float,double>(eflag, vflag,
+                          fix->get_mixed_buffers(), force_const_single);
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    compute<double,double>(eflag, vflag,
+                           fix->get_double_buffers(), force_const_double);
+  } else {
+    compute<float,float>(eflag, vflag,
+                         fix->get_single_buffers(), force_const_single);
+  }
+  fix->balance_stamp();
+  vflag_fdotr = 0;  // cleared unconditionally once the templated call returns
+}
+
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulLongIntel::compute(int eflag, int vflag,
+					IntelBuffers<flt_t,acc_t> *buffers,
+					const ForceConst<flt_t> &fc)
+{ // Optionally pack atom data, then run eval<>() twice: offload range and host range.
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = vflag_fdotr = 0;
+  // [0,offload_end) is passed to eval with offload=1, [host_start,inum) with offload=0
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+  // Pack here only when ago != 0 and the fix reports no separate buffers
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal+atom->nghost, 
+			      nthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);  // pack the range [ifrom,ito) set by the macro above
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+  
+  // -------------------- Regular version
+  if (evflag || vflag_fdotr) {  // template arguments below are <EVFLAG,EFLAG,NEWTON_PAIR>
+    int ovflag = 0;  // vflag value handed to eval: 2 if vflag_fdotr, else 1 if vflag, else 0
+    if (vflag_fdotr) ovflag = 2;
+    else if (vflag) ovflag = 1;
+    if (eflag) {
+      if (force->newton_pair) {
+	eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+	eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    } else {
+      if (force->newton_pair) {
+	eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+	eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+	eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    }
+  } else {  // no energy/virial requested: EVFLAG = EFLAG = 0 and vflag argument 0
+    if (force->newton_pair) {
+      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairLJCharmmCoulLongIntel::eval(const int offload, const int vflag,
+				     IntelBuffers<flt_t,acc_t> *buffers,
+				     const ForceConst<flt_t> &fc,
+				     const int astart, const int aend)
+{ // Pair kernel for list entries [astart,aend); offload != 0 selects the coprocessor pass.
+  const int inum = aend - astart;
+  if (inum == 0) return;  // empty range: nothing to do
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);  // sets nlocal, nall, minlocal for this pass
+  // EVFLAG/EFLAG/NEWTON_PAIR are compile-time switches selected by the caller, compute<>()
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+  // Coordinates (x), charges (q) and neighbor arrays handed out by the buffers object
+  ATOM_T * restrict const x = buffers->get_x(offload);
+  flt_t * restrict const q = buffers->get_q(offload);
+
+  const int * restrict const numneigh = list->numneigh;
+  const int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int * restrict const firstneigh = buffers->firstneigh(list);
+  // Constants filled in by pack_force_const(), stored in precision flt_t
+  const flt_t * restrict const special_coul = fc.special_coul;
+  const flt_t * restrict const special_lj = fc.special_lj;
+  const flt_t qqrd2e = force->qqrd2e;
+  const flt_t inv_denom_lj = (flt_t)1.0/denom_lj;
+
+  const flt_t * restrict const cutsq = fc.cutsq[0];
+  const LJ_T * restrict const lj = fc.lj[0];
+  const TABLE_T * restrict const table = fc.table;
+  const flt_t * restrict const etable = fc.etable;
+  const flt_t * restrict const detable = fc.detable;
+  const flt_t * restrict const ctable = fc.ctable;
+  const flt_t * restrict const dctable = fc.dctable;
+  const flt_t cut_ljsq = fc.cut_ljsq;
+  const flt_t cut_lj_innersq = fc.cut_lj_innersq;
+  const flt_t cut_coulsq = fc.cut_coulsq;
+  const flt_t g_ewald = fc.g_ewald;
+  const flt_t tabinnersq = fc.tabinnersq;
+
+  const int ntypes = atom->ntypes + 1;  // row length used to index the flattened cutsq/lj arrays
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+		       buffers, offload, fix, separate_flag,
+		       x_size, q_size, ev_size, f_stride);
+		       
+  int tc;
+  FORCE_T * restrict f_start;
+  acc_t * restrict ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+
+  const int nthreads = tc;  // thread count as set by IP_PRE_get_buffers
+  #ifdef _LMP_INTEL_OFFLOAD
+  int *overflow = fix->get_off_overflow_flag();
+  double *timer_compute = fix->off_watch_pair();
+  // Redeclare as local variables for offload
+  const int ncoultablebits = this->ncoultablebits;
+  const int ncoulmask = this->ncoulmask;
+  const int ncoulshiftbits = this->ncoulshiftbits;
+  #ifdef INTEL_ALLOW_TABLE
+  #define ITABLE_IN in(table,etable,detable:length(0) alloc_if(0) free_if(0)) \
+                    in(ctable,dctable:length(0) alloc_if(0) free_if(0)) \
+                    in(ncoultablebits,tabinnersq,ncoulmask,ncoulshiftbits)
+  #else
+  #define ITABLE_IN
+  #endif
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+  #pragma offload target(mic:_cop) if(offload) \
+    in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
+    in(cutsq,lj:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(nthreads,qqrd2e,g_ewald,inum,nall,ntypes,cut_coulsq,vflag,eatom) \
+    in(f_stride,separate_flag,offload) \
+    in(astart,cut_ljsq,cut_lj_innersq,nlocal,inv_denom_lj,minlocal) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    ITABLE_IN signal(f_start)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, 
+			      f_stride, x, q);
+    // Energy/virial totals; they appear in the OpenMP reduction clause below
+    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EVFLAG) {
+      oevdwl = oecoul = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(f_start,f_stride,nlocal,nall,minlocal) \
+      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      iifrom += astart;  // shift the per-thread range so it starts at astart
+      iito += astart;
+      // Thread tid writes into its own f_stride-sized slice of f_start
+      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));  // zero that slice
+      flt_t cutboth = cut_coulsq;  // NOTE(review): not referenced again in the visible code
+
+      for (int i = iifrom; i < iito; ++i) {
+	//        const int i = ilist[ii];
+        const int itype = x[i].w;  // atom type is read from the w component
+
+        const int ptr_off = itype * ntypes;
+        const flt_t * restrict const cutsqi = cutsq + ptr_off;
+        const LJ_T * restrict const lji = lj + ptr_off;  // LJ coefficient row for itype
+
+        const int   * restrict const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp,fytmp,fztmp,fwtmp;
+	acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const flt_t qtmp = q[i];
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EVFLAG) {
+	  if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+	}
+
+	#pragma vector aligned
+	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
+	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        for (int jj = 0; jj < jnum; jj++) {
+          flt_t forcecoul, forcelj, evdwl, ecoul;
+          forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
+
+          const int sbindex = jlist[jj] >> SBBITS & 3;  // 2-bit code above bit SBBITS; indexes special_*
+          const int j = jlist[jj] & NEIGHMASK;  // neighbor index after masking with NEIGHMASK
+
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const int jtype = x[j].w;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+          const flt_t r2inv = (flt_t)1.0 / rsq;
+
+	  #ifdef __MIC__
+	  if (rsq < cut_coulsq) {
+          #endif
+            #ifdef INTEL_ALLOW_TABLE
+            if (!ncoultablebits || rsq <= tabinnersq) {
+            #endif
+              const flt_t A1 =  0.254829592;  // A1..A5: polynomial coefficients used for erfc below
+              const flt_t A2 = -0.284496736;
+              const flt_t A3 =  1.421413741;
+              const flt_t A4 = -1.453152027;
+              const flt_t A5 =  1.061405429;
+              const flt_t EWALD_F = 1.12837917;
+              const flt_t INV_EWALD_P = 1.0 / 0.3275911;
+
+              const flt_t r = sqrt(rsq);
+              const flt_t grij = g_ewald * r;
+              const flt_t expm2 = exp(-grij * grij);
+              const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
+              const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+              const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
+              forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+              if (EFLAG) ecoul = prefactor * erfc;
+              if (sbindex) {  // nonzero code: subtract (1 - special_coul) * prefactor
+                const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
+                    prefactor;
+                forcecoul -= adjust;
+                if (EFLAG) ecoul -= adjust;
+              }
+            #ifdef INTEL_ALLOW_TABLE
+            } else {
+              float rsq_lookup = rsq;  // table index is taken from the float bit pattern of rsq
+              const int itable = (__intel_castf32_u32(rsq_lookup) &
+                  ncoulmask) >> ncoulshiftbits;
+              const flt_t fraction = (rsq_lookup - table[itable].r) *
+                  table[itable].dr;
+
+              const flt_t tablet = table[itable].f +
+                  fraction * table[itable].df;
+              forcecoul = qtmp * q[j] * tablet;
+              if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
+                  fraction * detable[itable]);
+              if (sbindex) {
+                const flt_t table2 = ctable[itable] +
+                    fraction * dctable[itable];
+                const flt_t prefactor = qtmp * q[j] * table2;
+                const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
+                    prefactor;
+                forcecoul -= adjust;
+                if (EFLAG) ecoul -= adjust;
+              }
+            }
+            #endif
+	  #ifdef __MIC__
+	  }
+	  #endif
+
+	  #ifdef __MIC__
+	  if (rsq < cut_ljsq) {
+	  #endif
+            flt_t r6inv = r2inv * r2inv * r2inv;
+            forcelj = r6inv * (lji[jtype].x * r6inv - lji[jtype].y);  // x,y hold lj1,lj2 (see pack_force_const)
+            if (EFLAG) evdwl = r6inv*(lji[jtype].z * r6inv - lji[jtype].w);  // z,w hold lj3,lj4
+
+	    #ifdef __MIC__
+	    if (rsq > cut_lj_innersq) {
+	    #endif
+              const flt_t drsq = cut_ljsq - rsq;
+              const flt_t cut2 = (rsq - cut_lj_innersq) * drsq;
+              const flt_t switch1 = drsq * (drsq * drsq + (flt_t)3.0 * cut2) *
+                  inv_denom_lj;
+              const flt_t switch2 = (flt_t)12.0 * rsq * cut2 * inv_denom_lj;
+              if (EFLAG) {  // switch1/switch2 are applied only where rsq > cut_lj_innersq
+		#ifndef __MIC__
+		if (rsq > cut_lj_innersq) {
+		#endif
+                  forcelj = forcelj * switch1 + evdwl * switch2;
+                  evdwl *= switch1;
+		#ifndef __MIC__
+		}
+		#endif
+              } else {
+                const flt_t philj = r6inv * (lji[jtype].z*r6inv -
+                    lji[jtype].w);
+		#ifndef __MIC__
+		if (rsq > cut_lj_innersq)
+		#endif
+                  forcelj =  forcelj * switch1 + philj * switch2;
+              }
+	    #ifdef __MIC__
+	    }
+	    #endif
+
+            if (sbindex) {
+              const flt_t factor_lj = special_lj[sbindex];
+              forcelj *= factor_lj;
+              if (EFLAG) evdwl *= factor_lj;
+            }
+	  #ifdef __MIC__
+	  }
+	  #else
+	  if (rsq > cut_coulsq) { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }  // non-MIC build: zero terms past the cutoffs
+	  if (rsq > cut_ljsq) { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
+	  #endif
+
+	  #ifdef __MIC__
+	  if (rsq < cut_coulsq) {
+	  #endif
+            const flt_t fpair = (forcecoul + forcelj) * r2inv;
+            fxtmp += delx * fpair;
+            fytmp += dely * fpair;
+            fztmp += delz * fpair;
+            if (NEWTON_PAIR || j < nlocal) {  // apply the opposite force to atom j
+              f[j].x -= delx * fpair;
+              f[j].y -= dely * fpair;
+              f[j].z -= delz * fpair;
+            }
+
+            if (EVFLAG) {
+              flt_t ev_pre = (flt_t)0;  // gains 0.5 for each of i and j that passes the test below
+              if (NEWTON_PAIR || i < nlocal)
+                ev_pre += (flt_t)0.5;
+              if (NEWTON_PAIR || j < nlocal)
+                ev_pre += (flt_t)0.5;
+
+              if (EFLAG) {
+                sevdwl += ev_pre * evdwl;
+                secoul += ev_pre * ecoul;
+                if (eatom) {  // per-atom energy accumulates in the w component
+                  if (NEWTON_PAIR || i < nlocal)
+                    fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+                  if (NEWTON_PAIR || j < nlocal) 
+                    f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+                }
+              }
+
+	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
+				   delx, dely, delz);
+            }
+	  #ifdef __MIC__
+	  }
+	  #endif
+        } // for jj
+        f[i].x += fxtmp;  // add atom i's accumulated force to this thread's slice
+        f[i].y += fytmp;
+        f[i].z += fztmp;
+
+	IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
+      } // for ii
+
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+			     nlocal, minlocal, nthreads, f_start, f_stride, 
+			     x);
+    } // end of omp parallel region
+    if (EVFLAG) {
+      if (EFLAG) {
+        ev_global[0] = oevdwl;  // slots 0-1: the two energy totals
+        ev_global[1] = oecoul;
+      }
+      if (vflag) {
+        ev_global[2] = ov0;  // slots 2-7: the six ov* totals
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end of offload region
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload, eatom);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCharmmCoulLongIntel::init_style()
+{
+  // Base-class setup first, then mark the last neighbor request as intel.
+  PairLJCharmmCoulLong::init_style();
+  neighbor->requests[neighbor->nrequest - 1]->intel = 1;
+  const int fix_index = modify->find_fix("package_intel");
+  if (fix_index < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[fix_index]);
+  #ifdef _LMP_INTEL_OFFLOAD
+  fix->set_offload_affinity();
+  _cop = fix->coprocessor_number();
+  #endif
+  // Free the neighbor buffers and repack constants for the active precision.
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    fix->get_mixed_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    fix->get_double_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  } else {
+    fix->get_single_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_single_buffers());
+  }
+}
+
+template <class flt_t, class acc_t>
+void PairLJCharmmCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                          IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // Fill fc with flt_t copies of the pair coefficients, cutoffs and coulomb
+  // tables; offload builds then transfer the arrays to coprocessor _cop.
+  const int tp1 = atom->ntypes + 1;
+  const int ntable = (ncoultablebits > 0) ? (1 << ncoultablebits) : 1;
+
+  fc.set_ntypes(tp1, ntable, memory, _cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  if (cut_lj > cut_coul)
+    error->all(FLERR,
+      "Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic");
+  for (int ti = 1; ti < tp1; ++ti) {
+    for (int tj = ti; tj < tp1; ++tj) {
+      // skip pairs that are neither set directly nor set on both diagonals
+      if (setflag[ti][tj] == 0 &&
+          (setflag[ti][ti] == 0 || setflag[tj][tj] == 0)) continue;
+      const double cut = init_one(ti, tj);
+      const double cutneigh = cut + neighbor->skin;
+      cutsq[ti][tj] = cutsq[tj][ti] = cut * cut;
+      cutneighsq[ti][tj] = cutneighsq[tj][ti] = cutneigh * cutneigh;
+    }
+  }
+
+  cut_coulsq = cut_coul * cut_coul;
+  cut_ljsq = cut_lj * cut_lj;
+  cut_lj_innersq = cut_lj_inner * cut_lj_inner;
+  cut_bothsq = MAX(cut_ljsq, cut_coulsq);
+
+  fc.cut_lj_innersq = cut_lj_innersq;
+  fc.cut_ljsq = cut_ljsq;
+  fc.cut_coulsq = cut_coulsq;
+  fc.tabinnersq = tabinnersq;
+  fc.g_ewald = force->kspace->g_ewald;
+
+  // Slot 0 always holds 1.0; slots 1-3 are copied from force
+  for (int k = 1; k < 4; ++k) {
+    fc.special_coul[k] = force->special_coul[k];
+    fc.special_lj[k] = force->special_lj[k];
+  }
+  fc.special_lj[0] = fc.special_coul[0] = 1.0;
+
+  for (int ti = 0; ti < tp1; ++ti) {
+    for (int tj = 0; tj < tp1; ++tj) {
+      fc.cutsq[ti][tj] = cutsq[ti][tj];
+      fc.lj[ti][tj].x = lj1[ti][tj];
+      fc.lj[ti][tj].y = lj2[ti][tj];
+      fc.lj[ti][tj].z = lj3[ti][tj];
+      fc.lj[ti][tj].w = lj4[ti][tj];
+    }
+  }
+
+  if (ncoultablebits) {
+    for (int k = 0; k < ntable; ++k) {
+      fc.table[k].r = rtable[k];
+      fc.table[k].dr = drtable[k];
+      fc.table[k].f = ftable[k];
+      fc.table[k].df = dftable[k];
+      fc.etable[k] = etable[k];
+      fc.detable[k] = detable[k];
+      fc.ctable[k] = ctable[k];
+      fc.dctable[k] = dctable[k];
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  flt_t * special_lj = fc.special_lj;
+  flt_t * special_coul = fc.special_coul;
+  flt_t * cutsq = fc.cutsq[0];
+  LJ_T * lj = fc.lj[0];
+  TABLE_T * table = fc.table;
+  flt_t * etable = fc.etable;
+  flt_t * detable = fc.detable;
+  flt_t * ctable = fc.ctable;
+  flt_t * dctable = fc.dctable;
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  #pragma offload_transfer target(mic:_cop) \
+    in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
+    in(cutsq,lj: length(tp1sq) alloc_if(0) free_if(0)) \
+    in(table: length(ntable) alloc_if(0) free_if(0)) \
+    in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
+    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void PairLJCharmmCoulLongIntel::ForceConst<flt_t>::set_ntypes(
+    const int ntypes, const int ntable, Memory *memory, const int cop) {
+  // Resize the per-type and table arrays. Nothing is reallocated unless
+  // ntypes or ntable changed; ntypes == 0 only releases what is held.
+  const bool resize = (ntypes != _ntypes) || (ntable != _ntable);
+
+  if (resize && _ntypes > 0) {
+    #ifdef _LMP_INTEL_OFFLOAD
+    flt_t * ospecial_lj = special_lj;
+    flt_t * ospecial_coul = special_coul;
+    flt_t * ocutsq = cutsq[0];
+    typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+    table_t * otable = table;
+    flt_t * oetable = etable;
+    flt_t * odetable = detable;
+    flt_t * octable = ctable;
+    flt_t * odctable = dctable;
+    if (cop >= 0 && ospecial_lj != NULL && ospecial_coul != NULL &&
+        ocutsq != NULL && olj != NULL && otable != NULL &&
+        oetable != NULL && odetable != NULL &&
+        octable != NULL && odctable != NULL) {
+      #pragma offload_transfer target(mic:cop) \
+        nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
+        nocopy(ocutsq, olj: alloc_if(0) free_if(1)) \
+        nocopy(otable: alloc_if(0) free_if(1)) \
+        nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
+    }
+    #endif
+
+    _memory->destroy(cutsq);
+    _memory->destroy(lj);
+    _memory->destroy(table);
+    _memory->destroy(etable);
+    _memory->destroy(detable);
+    _memory->destroy(ctable);
+    _memory->destroy(dctable);
+  }
+  if (resize && ntypes > 0) {
+    _cop = cop;
+    memory->create(cutsq,ntypes,ntypes,"fc.cutsq");
+    memory->create(lj,ntypes,ntypes,"fc.lj");
+    memory->create(table,ntable,"pair:fc.table");
+    memory->create(etable,ntable,"pair:fc.etable");
+    memory->create(detable,ntable,"pair:fc.detable");
+    memory->create(ctable,ntable,"pair:fc.ctable");
+    memory->create(dctable,ntable,"pair:fc.dctable");
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    flt_t * ospecial_lj = special_lj;
+    flt_t * ospecial_coul = special_coul;
+    flt_t * ocutsq = cutsq[0];
+    typename IntelBuffers<flt_t,flt_t>::vec4_t * olj = lj[0];
+    table_t * otable = table;
+    flt_t * oetable = etable;
+    flt_t * odetable = detable;
+    flt_t * octable = ctable;
+    flt_t * odctable = dctable;
+    int tp1sq = ntypes*ntypes;
+    if (cop >= 0 && ospecial_lj != NULL && ospecial_coul != NULL &&
+        ocutsq != NULL && olj != NULL && otable != NULL &&
+        oetable != NULL && odetable != NULL &&
+        octable != NULL && odctable != NULL) {
+      #pragma offload_transfer target(mic:cop) \
+        nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
+        nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
+        nocopy(ocutsq,olj: length(tp1sq) alloc_if(1) free_if(0)) \
+        nocopy(otable: length(ntable) alloc_if(1) free_if(0)) \
+        nocopy(oetable,odetable: length(ntable) alloc_if(1) free_if(0)) \
+        nocopy(octable,odctable: length(ntable) alloc_if(1) free_if(0))
+    }
+    #endif
+  }
+  _ntypes=ntypes;
+  _ntable=ntable;
+  _memory=memory;
+}
diff --git a/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
new file mode 100644
index 0000000000..ad66c786b6
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_charmm_coul_long_intel.h
@@ -0,0 +1,104 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/charmm/coul/long/intel,PairLJCharmmCoulLongIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CHARMM_COUL_LONG_INTEL_H
+#define LMP_PAIR_LJ_CHARMM_COUL_LONG_INTEL_H
+
+#include "pair_lj_charmm_coul_long.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCharmmCoulLongIntel : public PairLJCharmmCoulLong {
+  // Pair style lj/charmm/coul/long/intel, layered on PairLJCharmmCoulLong.
+ public:
+  PairLJCharmmCoulLongIntel(class LAMMPS *);
+  virtual ~PairLJCharmmCoulLongIntel();
+
+  virtual void compute(int, int);
+  void init_style();
+
+  typedef struct { float x,y,z; int w; } sng4_t;
+
+ private:
+  FixIntel *fix;  // fix found under the id "package_intel" in init_style()
+  int _cop;       // coprocessor number taken from the fix in offload builds
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // Constants in precision flt_t read by eval(); arrays sized by set_ntypes()
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t r, dr, f, df; } table_t;
+    __declspec(align(64)) flt_t special_coul[4];
+    __declspec(align(64)) flt_t special_lj[4];
+    flt_t **cutsq, g_ewald, tabinnersq;
+    flt_t cut_coulsq, cut_ljsq;
+    flt_t cut_lj_innersq;
+    table_t *table;
+    flt_t *etable, *detable, *ctable, *dctable;
+    typename IntelBuffers<flt_t,flt_t>::vec4_t **lj;  // x,y,z,w = lj1..lj4
+    // All private members start defined: the destructor always reads _cop.
+    ForceConst() : _ntypes(0), _ntable(0), _cop(-1), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0,0,NULL,_cop); }  // frees whatever is held
+
+    void set_ntypes(const int ntypes, const int ntable, Memory *memory,
+                    const int cop);
+
+   private:
+    int _ntypes, _ntable, _cop;  // _cop stays -1 until set_ntypes stores one
+    Memory *_memory;             // allocator used for the last create()
+  };
+  ForceConst<float> force_const_single;   // used for single and mixed modes
+  ForceConst<double> force_const_double;  // used for double mode
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+E: Intel varient of lj/charmm/coul/long expects lj cutoff<=coulombic
+
+The intel accelerated version of the CHARMM style requires that the
+Lennard-Jones cutoff is not greater than the coulombic cutoff.
+
+*/
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
new file mode 100644
index 0000000000..4163a1f7d2
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.cpp
@@ -0,0 +1,634 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_coul_long_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "group.h"
+#include "kspace.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define C_FORCE_T typename ForceConst<flt_t>::c_force_t
+#define C_ENERGY_T typename ForceConst<flt_t>::c_energy_t
+#define TABLE_T typename ForceConst<flt_t>::table_t
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulLongIntel::PairLJCutCoulLongIntel(LAMMPS *lmp) :
+  PairLJCutCoulLong(lmp), fix(NULL), _cop(-1)  // defined until init_style() sets them
+{
+  suffix_flag |= Suffix::INTEL;  // tag this style with the INTEL suffix bit
+  respa_enable = 0;  // _cop < 0 is the "no coprocessor" case tested by the offload code
+  cut_respa = NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairLJCutCoulLongIntel::~PairLJCutCoulLongIntel()
+{  // intentionally empty: nothing is released in this destructor body
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulLongIntel::compute(int eflag, int vflag)
+{
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);   // flt_t = float, acc_t = double
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  } else {                                        // any other mode: float / float
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+  }
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+template <class flt_t, class acc_t>
+void PairLJCutCoulLongIntel::compute(int eflag, int vflag,
+				     IntelBuffers<flt_t,acc_t> *buffers,
+				     const ForceConst<flt_t> &fc)
+{
+  // Pack atom data when required, then call eval() twice: first for the
+  // offload range [0,offload_end), then for the host range [host_start,inum).
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  // threaded pack of all local + ghost atoms; skipped when ago == 0 or when
+  // the fix reports separate buffers (eval() handles those cases itself)
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost, 
+				nthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  // EVFLAG/EFLAG/NEWTON_PAIR are template arguments, so every combination
+  // needs an explicit call; the offload call (1) always precedes the host (0).
+  const int newton = force->newton_pair;
+  if (evflag || vflag_fdotr) {
+    // 2 when vflag_fdotr is set, otherwise 1 when vflag is set, otherwise 0
+    const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
+    if (eflag && newton) {
+      eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+    } else if (eflag) {
+      eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+    } else if (newton) {
+      eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+    } else {
+      eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+      eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+    }
+  } else if (newton) {
+    eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
+    eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+  } else {
+    eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
+    eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+// Kernel for list atoms [astart,aend): LJ + erfc-damped Coulomb forces, optional energy/virial.
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairLJCutCoulLongIntel::eval(const int offload, const int vflag,
+				     IntelBuffers<flt_t,acc_t> *buffers,
+				     const ForceConst<flt_t> &fc,
+				     const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return;  // empty range: nothing to evaluate for this target
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+  // positions (x[].w is read as the atom type below) and charges for this target
+  ATOM_T * restrict const x = buffers->get_x(offload);
+  flt_t * restrict const q = buffers->get_q(offload);
+  // jlist of atom i starts at firstneigh + cnumneigh[i] and has numneigh[i] entries
+  const int * restrict const numneigh = list->numneigh;
+  const int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int * restrict const firstneigh = buffers->firstneigh(list);
+
+  const flt_t * restrict const special_coul = fc.special_coul;
+  const flt_t * restrict const special_lj = fc.special_lj;
+  const flt_t qqrd2e = force->qqrd2e;
+  // coefficient arrays viewed as 1-D; the row for type i begins at i*ntypes
+  const C_FORCE_T * restrict const c_force = fc.c_force[0];
+  const C_ENERGY_T * restrict const c_energy = fc.c_energy[0];
+  const TABLE_T * restrict const table = fc.table;
+  const flt_t * restrict const etable = fc.etable;
+  const flt_t * restrict const detable = fc.detable;
+  const flt_t * restrict const ctable = fc.ctable;
+  const flt_t * restrict const dctable = fc.dctable;
+  const flt_t g_ewald = fc.g_ewald;
+  const flt_t tabinnersq = fc.tabinnersq;
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+		       buffers, offload, fix, separate_flag,
+		       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * restrict f_start;
+  acc_t * restrict ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+
+  const int nthreads = tc;
+  #ifdef _LMP_INTEL_OFFLOAD
+  int *overflow = fix->get_off_overflow_flag();
+  double *timer_compute = fix->off_watch_pair();
+  // Redeclare as local variables for offload
+  const int ncoultablebits = this->ncoultablebits;
+  const int ncoulmask = this->ncoulmask;
+  const int ncoulshiftbits = this->ncoulshiftbits;
+  #ifdef INTEL_ALLOW_TABLE
+  #define ITABLE_IN in(table,etable,detable:length(0) alloc_if(0) free_if(0)) \
+                    in(ctable,dctable:length(0) alloc_if(0) free_if(0)) \
+                    in(ncoultablebits,tabinnersq,ncoulmask,ncoulshiftbits)
+  #else
+  #define ITABLE_IN
+  #endif
+
+  if (offload) fix->start_watch(TIME_OFFLOAD_LATENCY);
+  #pragma offload target(mic:_cop) if(offload) \
+    in(special_lj,special_coul:length(0) alloc_if(0) free_if(0)) \
+    in(c_force, c_energy:length(0) alloc_if(0) free_if(0)) \
+    in(firstneigh:length(0) alloc_if(0) free_if(0)) \
+    in(cnumneigh:length(0) alloc_if(0) free_if(0)) \
+    in(numneigh:length(0) alloc_if(0) free_if(0)) \
+    in(x:length(x_size) alloc_if(0) free_if(0)) \
+    in(q:length(q_size) alloc_if(0) free_if(0)) \
+    in(overflow:length(0) alloc_if(0) free_if(0)) \
+    in(astart,nthreads,qqrd2e,g_ewald,inum,nall,ntypes,vflag,eatom) \
+    in(f_stride,nlocal,minlocal,separate_flag,offload) \
+    out(f_start:length(f_stride) alloc_if(0) free_if(0)) \
+    out(ev_global:length(ev_size) alloc_if(0) free_if(0)) \
+    out(timer_compute:length(1) alloc_if(0) free_if(0)) \
+    ITABLE_IN signal(f_start)
+  #endif
+  {
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall, 
+			      f_stride, x, q);
+    // o*: totals combined across threads by the omp reduction, stored in ev_global below
+    acc_t oevdwl, oecoul, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EVFLAG) {
+      oevdwl = oecoul = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(f_start,f_stride,nlocal,nall,minlocal)	\
+      reduction(+:oevdwl,oecoul,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+
+      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+      // every thread zeroes and then fills its own f_stride-long slice of f_start
+      for (int i = iifrom; i < iito; ++i) {
+        const int itype = x[i].w;
+
+        const int ptr_off = itype * ntypes;
+        const C_FORCE_T * restrict const c_forcei = c_force + ptr_off;
+        const C_ENERGY_T * restrict const c_energyi = c_energy + ptr_off;
+
+        const int   * restrict const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp,fytmp,fztmp,fwtmp;
+	acc_t sevdwl, secoul, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        const flt_t qtmp = q[i];
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EVFLAG) {
+	  if (EFLAG) fwtmp = sevdwl = secoul = (acc_t)0;
+	  if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+	}
+
+	#pragma vector aligned
+	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, secoul, \
+	                       sv0, sv1, sv2, sv3, sv4, sv5)
+        for (int jj = 0; jj < jnum; jj++) {
+          flt_t forcecoul, forcelj, evdwl, ecoul;
+          forcecoul = forcelj = evdwl = ecoul = (flt_t)0.0;
+          // upper bits of the list entry pick the special_* factor, lower bits are j
+          const int sbindex = jlist[jj] >> SBBITS & 3;
+          const int j = jlist[jj] & NEIGHMASK;
+
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const int jtype = x[j].w;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+          const flt_t r2inv = (flt_t)1.0 / rsq;
+          // __MIC__ builds branch on the cutoffs; other builds zero out-of-range terms afterwards
+	  #ifdef __MIC__
+	  if (rsq < c_forcei[jtype].cutsq) {
+          #endif
+            #ifdef INTEL_ALLOW_TABLE
+            if (!ncoultablebits || rsq <= tabinnersq) {
+            #endif
+              const flt_t A1 =  0.254829592;
+              const flt_t A2 = -0.284496736;
+              const flt_t A3 =  1.421413741;
+              const flt_t A4 = -1.453152027;
+              const flt_t A5 =  1.061405429;
+              const flt_t EWALD_F = 1.12837917;
+              const flt_t INV_EWALD_P = 1.0 / 0.3275911;
+              // erfc(grij) approximated as polynomial(t) * exp(-grij^2)
+              const flt_t r = sqrt(rsq);
+              const flt_t grij = g_ewald * r;
+              const flt_t expm2 = exp(-grij * grij);
+              const flt_t t = INV_EWALD_P / (INV_EWALD_P + grij);
+              const flt_t erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+              const flt_t prefactor = qqrd2e * qtmp * q[j] / r;
+              forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
+              if (EFLAG) ecoul = prefactor * erfc;
+              if (sbindex) {
+                const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex])*
+                    prefactor;
+                forcecoul -= adjust;
+                if (EFLAG) ecoul -= adjust;
+              }
+            #ifdef INTEL_ALLOW_TABLE
+            } else {
+              float rsq_lookup = rsq;
+              const int itable = (__intel_castf32_u32(rsq_lookup) &
+                  ncoulmask) >> ncoulshiftbits;
+              const flt_t fraction = (rsq_lookup - table[itable].r) *
+                  table[itable].dr;
+
+              const flt_t tablet = table[itable].f +
+                  fraction * table[itable].df;
+              forcecoul = qtmp * q[j] * tablet;
+              if (EFLAG) ecoul = qtmp * q[j] * (etable[itable] +
+                  fraction * detable[itable]);
+              if (sbindex) {
+                const flt_t table2 = ctable[itable] +
+                    fraction * dctable[itable];
+                const flt_t prefactor = qtmp * q[j] * table2;
+                const flt_t adjust = ((flt_t)1.0 - special_coul[sbindex]) *
+                    prefactor;
+                forcecoul -= adjust;
+                if (EFLAG) ecoul -= adjust;
+              }
+            }
+            #endif
+	  #ifdef __MIC__
+	  }
+	  #endif
+          // 12-6 Lennard-Jones term; special_lj scales neighbors with sbindex != 0
+	  #ifdef __MIC__
+	  if (rsq < c_forcei[jtype].cut_ljsq) {
+	  #endif
+            flt_t r6inv = r2inv * r2inv * r2inv;
+            forcelj = r6inv * (c_forcei[jtype].lj1 * r6inv -
+			       c_forcei[jtype].lj2);
+            if (EFLAG) evdwl = r6inv*(c_energyi[jtype].lj3 * r6inv -
+                                      c_energyi[jtype].lj4) -
+                               c_energyi[jtype].offset;
+
+            if (sbindex) {
+              const flt_t factor_lj = special_lj[sbindex];
+              forcelj *= factor_lj;
+              if (EFLAG) evdwl *= factor_lj;
+            }
+	  #ifdef __MIC__
+	  }
+	  #else
+	  if (rsq > c_forcei[jtype].cutsq)
+	    { forcecoul = (flt_t)0.0; ecoul = (flt_t)0.0; }
+	  if (rsq > c_forcei[jtype].cut_ljsq)
+	    { forcelj = (flt_t)0.0; evdwl = (flt_t)0.0; }
+	  #endif
+
+	  #ifdef __MIC__
+	  if (rsq < c_forcei[jtype].cutsq) {
+	  #endif
+            const flt_t fpair = (forcecoul + forcelj) * r2inv;
+            fxtmp += delx * fpair;
+            fytmp += dely * fpair;
+            fztmp += delz * fpair;
+            if (NEWTON_PAIR || j < nlocal) {
+              f[j].x -= delx * fpair;
+              f[j].y -= dely * fpair;
+              f[j].z -= delz * fpair;
+            }
+            // ev_pre: 0.5 for each of i and j whose share is tallied here
+            if (EVFLAG) {
+              flt_t ev_pre = (flt_t)0;
+              if (NEWTON_PAIR || i < nlocal)
+                ev_pre += (flt_t)0.5;
+              if (NEWTON_PAIR || j < nlocal)
+                ev_pre += (flt_t)0.5;
+
+              if (EFLAG) {
+                sevdwl += ev_pre * evdwl;
+                secoul += ev_pre * ecoul;
+                if (eatom) {
+                  if (NEWTON_PAIR || i < nlocal)
+                    fwtmp += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+                  if (NEWTON_PAIR || j < nlocal) 
+                    f[j].w += (flt_t)0.5 * evdwl + (flt_t)0.5 * ecoul;
+                }
+              }
+ 	      IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair, delx, dely, delz);
+            }
+          #ifdef __MIC__
+	  }
+	  #endif
+        } // for jj
+
+        f[i].x += fxtmp;
+        f[i].y += fytmp;
+        f[i].z += fztmp;
+	IP_PRE_ev_tally_atomq(EVFLAG, EFLAG, vflag, f, fwtmp);
+      } // for ii
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+			     nlocal, minlocal, nthreads, f_start, f_stride, 
+			     x);
+    } // end of omp parallel region
+    if (EVFLAG) {
+      if (EFLAG) {
+        ev_global[0] = oevdwl;
+        ev_global[1] = oecoul;
+      }
+      if (vflag) {
+        ev_global[2] = ov0;
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end of offload region
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+  // hand the force buffer (and energy/virial totals when EVFLAG) back to the fix
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload, eatom);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairLJCutCoulLongIntel::init_style()
+{
+  // Base-class setup, then bind to the package_intel fix and repack coefficients.
+  PairLJCutCoulLong::init_style();
+  neighbor->requests[neighbor->nrequest-1]->intel = 1;  // flag the latest request
+  const int ifix = modify->find_fix("package_intel");
+  if (ifix < 0) error->all(FLERR,"The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+  #ifdef _LMP_INTEL_OFFLOAD
+  fix->set_offload_affinity();
+  _cop = fix->coprocessor_number();
+  #endif
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    IntelBuffers<float,double> *mixed = fix->get_mixed_buffers();
+    mixed->free_all_nbor_buffers();
+    pack_force_const(force_const_single, mixed);
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    IntelBuffers<double,double> *dbl = fix->get_double_buffers();
+    dbl->free_all_nbor_buffers();
+    pack_force_const(force_const_double, dbl);
+  } else {
+    IntelBuffers<float,float> *sngl = fix->get_single_buffers();
+    sngl->free_all_nbor_buffers();
+    pack_force_const(force_const_single, sngl);
+  }
+}
+
+template <class flt_t, class acc_t>
+void PairLJCutCoulLongIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                          IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // Fill fc and the buffers' cutneighsq; offload builds then send the data to _cop.
+  const int tp1 = atom->ntypes + 1;
+  int ntable = 1;
+  for (int bit = 0; bit < ncoultablebits; bit++) ntable *= 2;
+
+  fc.set_ntypes(tp1, ntable, memory, _cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  for (int itype = 1; itype <= atom->ntypes; itype++) {
+    for (int jtype = itype; jtype <= atom->ntypes; jtype++) {
+      if (setflag[itype][jtype] == 0 && (setflag[itype][itype] == 0 || setflag[jtype][jtype] == 0)) continue;
+      const double cut = init_one(itype, jtype);
+      const double cutneigh = cut + neighbor->skin;
+      cutsq[itype][jtype] = cutsq[jtype][itype] = cut*cut;
+      cutneighsq[itype][jtype] = cutneighsq[jtype][itype] = cutneigh * cutneigh;
+    }
+  }
+
+  fc.tabinnersq = tabinnersq;
+  fc.g_ewald = force->kspace->g_ewald;
+
+  // slot 0 is pinned to 1.0; eval() only applies special_* when sbindex != 0
+  fc.special_lj[0] = fc.special_coul[0] = 1.0;
+  for (int k = 1; k < 4; k++) {
+    fc.special_lj[k] = force->special_lj[k];
+    fc.special_coul[k] = force->special_coul[k];
+  }
+
+  for (int itype = 0; itype < tp1; itype++) {
+    for (int jtype = 0; jtype < tp1; jtype++) {
+      C_FORCE_T &cf = fc.c_force[itype][jtype];
+      C_ENERGY_T &ce = fc.c_energy[itype][jtype];
+      cf.cutsq = cutsq[itype][jtype];
+      cf.cut_ljsq = cut_ljsq[itype][jtype];
+      cf.lj1 = lj1[itype][jtype];
+      cf.lj2 = lj2[itype][jtype];
+      ce.lj3 = lj3[itype][jtype];
+      ce.lj4 = lj4[itype][jtype];
+      ce.offset = offset[itype][jtype];
+    }
+  }
+
+  if (ncoultablebits) {
+    for (int k = 0; k < ntable; k++) {
+      fc.table[k].r = rtable[k];
+      fc.table[k].dr = drtable[k];
+      fc.table[k].f = ftable[k];
+      fc.table[k].df = dftable[k];
+      fc.etable[k] = etable[k];
+      fc.detable[k] = detable[k];
+      fc.ctable[k] = ctable[k];
+      fc.dctable[k] = dctable[k];
+    }
+  }
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_cop < 0) return;
+  flt_t * special_lj = fc.special_lj;
+  flt_t * special_coul = fc.special_coul;
+  C_FORCE_T * c_force = fc.c_force[0];
+  C_ENERGY_T * c_energy = fc.c_energy[0];
+  TABLE_T * table = fc.table;
+  flt_t * etable = fc.etable;
+  flt_t * detable = fc.detable;
+  flt_t * ctable = fc.ctable;
+  flt_t * dctable = fc.dctable;
+  flt_t * ocutneighsq = cutneighsq[0];
+  int tp1sq = tp1 * tp1;
+  #pragma offload_transfer target(mic:_cop) \
+    in(special_lj, special_coul: length(4) alloc_if(0) free_if(0)) \
+    in(c_force, c_energy: length(tp1sq) alloc_if(0) free_if(0)) \
+    in(table: length(ntable) alloc_if(0) free_if(0)) \
+    in(etable,detable,ctable,dctable: length(ntable) alloc_if(0) free_if(0)) \
+    in(ocutneighsq: length(tp1sq) alloc_if(0) free_if(0))
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+// Release storage sized for the previous (ntypes, ntable), then allocate for the new sizes.
+template <class flt_t>
+void PairLJCutCoulLongIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
+							   const int ntable,
+							   Memory *memory,
+							   const int cop) {
+  if ( (ntypes != _ntypes || ntable != _ntable) ) {
+    if (_ntypes > 0) {
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      c_force_t * oc_force = c_force[0];
+      c_energy_t * oc_energy = c_energy[0];
+      table_t * otable = table;
+      flt_t * oetable = etable;
+      flt_t * odetable = detable;
+      flt_t * octable = ctable;
+      flt_t * odctable = dctable;
+      if (ospecial_lj != NULL && oc_force != NULL &&
+          oc_energy != NULL && otable != NULL && oetable != NULL &&
+          odetable != NULL && octable != NULL && odctable != NULL &&
+          ospecial_coul != NULL && _cop >= 0) {  // free on _cop, the device used at allocation
+        #pragma offload_transfer target(mic:_cop) \
+          nocopy(ospecial_lj, ospecial_coul: alloc_if(0) free_if(1)) \
+	  nocopy(oc_force, oc_energy: alloc_if(0) free_if(1)) \
+	  nocopy(otable: alloc_if(0) free_if(1)) \
+	  nocopy(oetable, odetable, octable, odctable: alloc_if(0) free_if(1))
+      }
+      #endif
+      // host copies are released through the Memory object saved by the previous call
+      _memory->destroy(c_force);
+      _memory->destroy(c_energy);
+      _memory->destroy(table);
+      _memory->destroy(etable);
+      _memory->destroy(detable);
+      _memory->destroy(ctable);
+      _memory->destroy(dctable);
+    }
+    if (ntypes > 0) {
+      _cop = cop;  // remembered so the matching free above targets the same device
+      memory->create(c_force,ntypes,ntypes,"fc.c_force");
+      memory->create(c_energy,ntypes,ntypes,"fc.c_energy");
+      memory->create(table,ntable,"pair:fc.table");
+      memory->create(etable,ntable,"pair:fc.etable");
+      memory->create(detable,ntable,"pair:fc.detable");
+      memory->create(ctable,ntable,"pair:fc.ctable");
+      memory->create(dctable,ntable,"pair:fc.dctable");
+      // offload builds: reserve matching storage on cop; pack_force_const sends the contents
+      #ifdef _LMP_INTEL_OFFLOAD
+      flt_t * ospecial_lj = special_lj;
+      flt_t * ospecial_coul = special_coul;
+      c_force_t * oc_force = c_force[0];
+      c_energy_t * oc_energy = c_energy[0];
+      table_t * otable = table;
+      flt_t * oetable = etable;
+      flt_t * odetable = detable;
+      flt_t * octable = ctable;
+      flt_t * odctable = dctable;
+      int tp1sq = ntypes*ntypes;
+      if (ospecial_lj != NULL && oc_force != NULL &&
+          oc_energy != NULL && otable !=NULL && oetable != NULL &&
+          odetable != NULL && octable != NULL && odctable != NULL &&
+          ospecial_coul != NULL && cop >= 0) {
+        #pragma offload_transfer target(mic:cop) \
+          nocopy(ospecial_lj: length(4) alloc_if(1) free_if(0)) \
+          nocopy(ospecial_coul: length(4) alloc_if(1) free_if(0)) \
+          nocopy(oc_force: length(tp1sq) alloc_if(1) free_if(0)) \
+          nocopy(oc_energy: length(tp1sq) alloc_if(1) free_if(0)) \
+          nocopy(otable: length(ntable) alloc_if(1) free_if(0)) \
+          nocopy(oetable,odetable: length(ntable) alloc_if(1) free_if(0)) \
+          nocopy(octable,odctable: length(ntable) alloc_if(1) free_if(0))
+      }
+      #endif
+    }
+  }
+  _ntypes=ntypes;
+  _ntable=ntable;
+  _memory=memory;
+}
diff --git a/src/USER-INTEL/pair_lj_cut_coul_long_intel.h b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
new file mode 100644
index 0000000000..d7b4282a99
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_cut_coul_long_intel.h
@@ -0,0 +1,100 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/coul/long/intel,PairLJCutCoulLongIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_COUL_LONG_INTEL_H
+#define LMP_PAIR_LJ_CUT_COUL_LONG_INTEL_H
+
+#include "pair_lj_cut_coul_long.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class PairLJCutCoulLongIntel : public PairLJCutCoulLong {
+  // lj/cut/coul/long/intel pair style: float, mixed or double kernels driven by FixIntel
+ public:
+  PairLJCutCoulLongIntel(class LAMMPS *);
+  virtual ~PairLJCutCoulLongIntel();
+
+  virtual void compute(int, int);
+  void init_style();
+  // NOTE(review): sng4_t is not referenced anywhere in this class - confirm it is still needed
+  typedef struct { float x,y,z; int w; } sng4_t;
+
+ private:
+  FixIntel *fix;  // the package_intel fix, looked up in init_style()
+  int _cop;       // coprocessor number from the fix (assigned only in offload builds)
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+	    IntelBuffers<flt_t,acc_t> * buffers,
+	    const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // ----------------------------------------------------------------------
+  template <class flt_t>
+  class ForceConst {  // coefficients and Coulomb tables stored as flt_t
+   public:
+    typedef struct { flt_t cutsq, cut_ljsq, lj1, lj2; } c_force_t;
+    typedef struct { flt_t lj3, lj4, offset, pad; } c_energy_t;
+    typedef struct { flt_t r, dr, f, df; } table_t;
+    __declspec(align(64)) flt_t special_coul[4];
+    __declspec(align(64)) flt_t special_lj[4];
+    flt_t g_ewald, tabinnersq;
+    c_force_t **c_force;
+    c_energy_t **c_energy;
+    table_t *table;
+    flt_t *etable, *detable, *ctable, *dctable;
+    // _cop/_memory start defined: the destructor reads _cop even if set_ntypes never ran
+    ForceConst() : _ntypes(0), _ntable(0), _cop(-1), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0,0,NULL,_cop); }
+    // frees storage when the sizes change and allocates again when ntypes > 0
+    void set_ntypes(const int ntypes, const int ntable, Memory *memory,
+		    const int cop);
+
+   private:
+    int _ntypes, _ntable, _cop;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+*/
diff --git a/src/USER-INTEL/pair_lj_cut_intel.cpp b/src/USER-INTEL/pair_lj_cut_intel.cpp
new file mode 100644
index 0000000000..bca3a73493
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_cut_intel.cpp
@@ -0,0 +1,412 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include "math.h"
+#include "pair_lj_cut_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
+#define FC_PACKED2_T typename ForceConst<flt_t>::fc_packed2
+
+/* ---------------------------------------------------------------------- */
+
+// Construct the Intel-accelerated lj/cut style on top of PairLJCut.
+// Tags the style with the INTEL suffix and switches off rRESPA support.
+PairLJCutIntel::PairLJCutIntel(LAMMPS *lmp) :
+  PairLJCut(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+  respa_enable = 0;
+  cut_respa = NULL;
+
+  // fix is looked up in init_style(); keep it NULL until then
+  fix = NULL;
+
+  // _cop is handed to ForceConst::set_ntypes() by pack_force_const() but
+  // is not assigned anywhere else in this file, so give it a determinate
+  // value.  NOTE(review): -1 is meant as "no coprocessor" - confirm.
+  _cop = -1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Entry point called by the integrator: forwards to the templated
+// compute() that matches the precision mode reported by the Intel fix.
+void PairLJCutIntel::compute(int eflag, int vflag)
+{
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    // float arithmetic, double accumulation
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    // double arithmetic, double accumulation
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  } else {
+    // any other mode: float arithmetic, float accumulation
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+  }
+
+  fix->balance_stamp();
+
+  // the f-dot-r virial flag is always cleared once the kernels have run
+  vflag_fdotr = 0;
+}
+
+// Precision-specific driver: optionally repacks atom data into the Intel
+// buffers, then calls eval<> twice - once for the offload range
+// [0,offload_end) and once for the host range [host_start,inum) - with
+// template flags chosen from the energy/virial and newton settings.
+template <class flt_t, class acc_t>
+void PairLJCutIntel::compute(int eflag, int vflag,
+                             IntelBuffers<flt_t,acc_t> *buffers,
+                             const ForceConst<flt_t> &fc)
+{
+  // set up tallying, or switch it off entirely
+  if (eflag || vflag) ev_setup(eflag, vflag);
+  else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  // threaded repack of local + ghost atoms; skipped when ago == 0 or
+  // when the fix reports separate buffers
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) shared(eflag,vflag,buffers,fc)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                nthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  if (!(evflag || vflag_fdotr)) {
+    // forces only
+    if (force->newton_pair) {
+      eval<0,0,1>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,1>(0, 0, buffers, fc, host_start, inum);
+    } else {
+      eval<0,0,0>(1, 0, buffers, fc, 0, offload_end);
+      eval<0,0,0>(0, 0, buffers, fc, host_start, inum);
+    }
+    return;
+  }
+
+  // virial mode handed to the kernel: 2 = f-dot-r, 1 = explicit, 0 = none
+  const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
+
+  if (eflag && force->newton_pair) {
+    eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+  } else if (eflag) {
+    eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+  } else if (force->newton_pair) {
+    eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+  } else {
+    eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   Lennard-Jones kernel for the atoms with list index in [astart,aend).
+
+   Template flags select compile-time variants:
+     EVFLAG      - accumulate energy and/or virial
+     EFLAG       - accumulate energy (used inside the EVFLAG branch)
+     NEWTON_PAIR - apply the reaction force to every neighbor j instead
+                   of only those with j < nlocal
+   offload is forwarded to the buffer/fix helpers and selects which timer
+   is stopped at the end; vflag is forwarded to the tally macros.
+
+   The IP_PRE_* macros are defined outside this file; remarks about them
+   below are inferred from their names and arguments - TODO confirm.
+------------------------------------------------------------------------- */
+
+template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairLJCutIntel::eval(const int offload, const int vflag,
+                          IntelBuffers<flt_t,acc_t> *buffers,
+                          const ForceConst<flt_t> &fc,
+                          const int astart, const int aend)
+{
+  // nothing to do for an empty atom range
+  const int inum = aend - astart;
+  if (inum == 0) return;
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * restrict const x = buffers->get_x(offload);
+
+  const int * restrict const numneigh = list->numneigh;
+  const int * restrict const cnumneigh = buffers->cnumneigh(list);
+  const int * restrict const firstneigh = buffers->firstneigh(list);
+  const flt_t * restrict const special_lj = fc.special_lj;
+  // coefficient tables are addressed as flat arrays: [itype*ntypes + jtype]
+  const FC_PACKED1_T * restrict const ljc12o = fc.ljc12o[0];
+  const FC_PACKED2_T * restrict const lj34 = fc.lj34[0];
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EVFLAG, EFLAG, vflag,
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * restrict f_start;
+  acc_t * restrict ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+  int *overflow = fix->get_off_overflow_flag();
+  {
+    // timer_compute is not declared in this function; presumably it comes
+    // from one of the IP_PRE_* macros above - TODO confirm
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
+                              f_stride, x, 0);
+
+    // global energy / virial accumulators (OpenMP reduction targets)
+    acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EVFLAG) {
+      oevdwl = (acc_t)0;
+      if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+    }
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel default(none) \
+      shared(f_start,f_stride,nlocal,nall,minlocal) \
+      reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iito, tid;
+      IP_PRE_omp_range_id(iifrom, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+
+      // each thread writes into its own zeroed slice of the force buffer
+      FORCE_T * restrict const f = f_start - minlocal + (tid * f_stride);
+      memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+
+      for (int i = iifrom; i < iito; ++i) {
+        const int itype = x[i].w;
+
+        const int ptr_off = itype * ntypes;
+        const FC_PACKED1_T * restrict const ljc12oi = ljc12o + ptr_off;
+        const FC_PACKED2_T * restrict const lj34i = lj34 + ptr_off;
+
+        const int * restrict const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        // per-atom accumulators, added to f[i] after the neighbor loop
+        acc_t fxtmp, fytmp, fztmp, fwtmp;
+        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EVFLAG) {
+          if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+        }
+
+        #pragma vector aligned
+        #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+                               sv0, sv1, sv2, sv3, sv4, sv5)
+        for (int jj = 0; jj < jnum; jj++) {
+          flt_t forcelj, evdwl;
+          forcelj = evdwl = (flt_t)0.0;
+
+          // a neighbor entry carries the special-bond index in its upper
+          // bits and the atom index in the bits kept by NEIGHMASK
+          const int sbindex = jlist[jj] >> SBBITS & 3;
+          const int j = jlist[jj] & NEIGHMASK;
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          const int jtype = x[j].w;
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+
+          // __MIC__ builds branch on the cutoff; other builds run every
+          // pair through the arithmetic and zero r6inv beyond the cutoff
+          #ifdef __MIC__
+          if (rsq < ljc12oi[jtype].cutsq) {
+          #endif
+            flt_t factor_lj = special_lj[sbindex];
+            flt_t r2inv = 1.0 / rsq;
+            flt_t r6inv = r2inv * r2inv * r2inv;
+            #ifndef __MIC__
+            if (rsq > ljc12oi[jtype].cutsq) r6inv = (flt_t)0.0;
+            #endif
+            forcelj = r6inv * (ljc12oi[jtype].lj1 * r6inv - ljc12oi[jtype].lj2);
+            flt_t fpair = factor_lj * forcelj * r2inv;
+
+            fxtmp += delx * fpair;
+            fytmp += dely * fpair;
+            fztmp += delz * fpair;
+            if (NEWTON_PAIR || j < nlocal) {
+              f[j].x -= delx * fpair;
+              f[j].y -= dely * fpair;
+              f[j].z -= delz * fpair;
+            }
+
+            if (EVFLAG) {
+              // weight of this pair in the global tallies: 0.5 for each
+              // of i and j that qualifies
+              flt_t ev_pre = (flt_t)0;
+              if (NEWTON_PAIR || i<nlocal)
+                ev_pre += (flt_t)0.5;
+              if (NEWTON_PAIR || j<nlocal)
+                ev_pre += (flt_t)0.5;
+
+              if (EFLAG) {
+                evdwl = r6inv * (lj34i[jtype].lj3 * r6inv-lj34i[jtype].lj4);
+                // The energy shift may only be applied inside the cutoff.
+                // In the branch-free path out-of-range pairs arrive here
+                // with r6inv == 0; subtracting the offset for them would
+                // add a spurious -offset*factor_lj per such neighbor.
+                // The test mirrors the r6inv condition above.
+                #ifndef __MIC__
+                if (rsq <= ljc12oi[jtype].cutsq)
+                #endif
+                  evdwl -= ljc12oi[jtype].offset;
+                evdwl *= factor_lj;
+                sevdwl += ev_pre*evdwl;
+                if (eatom) {
+                  // per-atom energy is carried in the w component
+                  if (NEWTON_PAIR || i < nlocal)
+                    fwtmp += 0.5 * evdwl;
+                  if (NEWTON_PAIR || j < nlocal)
+                    f[j].w += 0.5 * evdwl;
+                }
+              }
+
+              IP_PRE_ev_tally_nbor(vflag, ev_pre, fpair,
+                                   delx, dely, delz);
+            }
+          #ifdef __MIC__
+          } // if rsq
+          #endif
+        } // for jj
+        f[i].x += fxtmp;
+        f[i].y += fytmp;
+        f[i].z += fztmp;
+
+        IP_PRE_ev_tally_atom(EVFLAG, EFLAG, vflag, f, fwtmp);
+      } // for ii
+
+      // all threads must finish their slice before the macro below reads
+      // the per-thread force buffers
+      #if defined(_OPENMP)
+      #pragma omp barrier
+      #endif
+      IP_PRE_fdotr_acc_force(NEWTON_PAIR, EVFLAG,  EFLAG, vflag, eatom, nall,
+                             nlocal, minlocal, nthreads, f_start, f_stride,
+                             x);
+    } // end omp
+    if (EVFLAG) {
+      // ev_global layout: [0] vdW energy, [1] zeroed, [2..7] virial
+      if (EFLAG) {
+        ev_global[0] = oevdwl;
+        ev_global[1] = (acc_t)0.0;
+      }
+      if (vflag) {
+        ev_global[2] = ov0;
+        ev_global[3] = ov1;
+        ev_global[4] = ov2;
+        ev_global[5] = ov3;
+        ev_global[6] = ov4;
+        ev_global[7] = ov5;
+      }
+    }
+    #ifdef __MIC__
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  // hand the results to the fix
+  if (EVFLAG)
+    fix->add_result_array(f_start, ev_global, offload, eatom);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Style initialisation: run the base-class setup, flag the most recent
+// neighbor request as an Intel request, locate the "package_intel" fix
+// and pack the force constants for the active precision mode.
+void PairLJCutIntel::init_style()
+{
+  PairLJCut::init_style();
+
+  // the request just added by the base class is the last one in the list
+  neighbor->requests[neighbor->nrequest-1]->intel = 1;
+
+  const int ifix = modify->find_fix("package_intel");
+  if (ifix < 0) {
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  }
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->offload_balance() != 0.0) {
+    error->all(FLERR,
+          "Offload for lj/cut/intel is not yet available. Set balance to 0.");
+  }
+  #endif
+
+  // drop neighbor buffers and (re)pack constants for the precision in use
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED) {
+    fix->get_mixed_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  } else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE) {
+    fix->get_double_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  } else {
+    fix->get_single_buffers()->free_all_nbor_buffers();
+    pack_force_const(force_const_single, fix->get_single_buffers());
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Copy the pair coefficients into the packed, precision-specific tables
+// of fc and fill the neighbor cutoffs held by the Intel buffers.
+template <class flt_t, class acc_t>
+void PairLJCutIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                      IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // tables are indexed 0..ntypes, hence the extra slot
+  const int ntypes_p1 = atom->ntypes + 1;
+  fc.set_ntypes(ntypes_p1,memory,_cop);
+  buffers->set_ntypes(ntypes_p1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      // skip pairs that are neither set explicitly nor available through
+      // both diagonal entries
+      const bool both_diagonal = setflag[i][i] != 0 && setflag[j][j] != 0;
+      if (setflag[i][j] == 0 && !both_diagonal) continue;
+
+      const double cut = init_one(i,j);
+      const double cutneigh = cut + neighbor->skin;
+      cutsq[i][j] = cutsq[j][i] = cut*cut;
+      cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+    }
+  }
+
+  // entry 0 is always 1.0; entries 1-3 come from the force settings
+  fc.special_lj[0] = 1.0;
+  for (int k = 1; k < 4; k++)
+    fc.special_lj[k] = force->special_lj[k];
+
+  // pack the per-type-pair coefficients
+  for (int i = 0; i < ntypes_p1; i++) {
+    for (int j = 0; j < ntypes_p1; j++) {
+      fc.ljc12o[i][j].lj1 = lj1[i][j];
+      fc.ljc12o[i][j].lj2 = lj2[i][j];
+      fc.lj34[i][j].lj3 = lj3[i][j];
+      fc.lj34[i][j].lj4 = lj4[i][j];
+      fc.ljc12o[i][j].cutsq = cutsq[i][j];
+      fc.ljc12o[i][j].offset = offset[i][j];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Resize the packed coefficient tables to ntypes x ntypes.
+//   ntypes - new table dimension; 0 releases the tables
+//   memory - allocator used for the new tables; it is remembered and
+//            used to release them on a later call
+//   cop    - value stored in _cop when tables are allocated
+// Nothing is reallocated when ntypes equals the current size.
+template <class flt_t>
+void PairLJCutIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
+                                                   Memory *memory,
+                                                   const int cop) {
+  if (ntypes != _ntypes) {
+    if (_ntypes > 0) {
+      // Destroy the 2-d arrays themselves rather than only their first
+      // row: passing ljc12o[0] releases just the data block and leaves
+      // the row-pointer table allocated and the member dangling.
+      // NOTE(review): relies on the Memory::destroy overload for 2-d
+      // arrays matching the Memory::create overload used below.
+      _memory->destroy(ljc12o);
+      _memory->destroy(lj34);
+    }
+    if (ntypes > 0) {
+      _cop = cop;
+      memory->create(ljc12o,ntypes,ntypes,"fc.c12o");
+      memory->create(lj34,ntypes,ntypes,"fc.lj34");
+    }
+  }
+  _ntypes = ntypes;
+  _memory = memory;
+}
diff --git a/src/USER-INTEL/pair_lj_cut_intel.h b/src/USER-INTEL/pair_lj_cut_intel.h
new file mode 100644
index 0000000000..a40e39af56
--- /dev/null
+++ b/src/USER-INTEL/pair_lj_cut_intel.h
@@ -0,0 +1,93 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(lj/cut/intel,PairLJCutIntel)
+
+#else
+
+#ifndef LMP_PAIR_LJ_CUT_INTEL_H
+#define LMP_PAIR_LJ_CUT_INTEL_H
+
+#include "pair_lj_cut.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+// lj/cut pair style using the Intel package buffers.  The public
+// compute() dispatches on the precision mode of the Intel fix to the
+// templated private implementation.
+class PairLJCutIntel : public PairLJCut {
+
+ public:
+  PairLJCutIntel(class LAMMPS *);
+
+  virtual void compute(int, int);
+  void init_style();
+
+ private:
+  FixIntel *fix;     // the "package_intel" fix, looked up in init_style()
+  int _cop;          // forwarded to ForceConst::set_ntypes()
+
+  template <class flt_t> class ForceConst;
+  // precision-specific driver behind the public compute()
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  // force kernel for the atoms in [astart,aend)
+  template <int EVFLAG, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  // copies the pair coefficients into fc
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // ----------------------------------------------------------------------
+
+  // Packed per-type-pair coefficients in precision flt_t.
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t cutsq, lj1, lj2, offset; } fc_packed1;
+    typedef struct { flt_t lj3, lj4; } fc_packed2;
+
+    __declspec(align(64)) flt_t special_lj[4];
+    fc_packed1 **ljc12o;
+    fc_packed2 **lj34;
+
+    // _cop and _memory get determinate values here because the destructor
+    // reads _cop even for an instance on which set_ntypes() was never
+    // called (the instance for the precision mode that is not in use).
+    // NOTE(review): -1 is meant as "no coprocessor" - confirm.
+    ForceConst() : _ntypes(0), _cop(-1), _memory(NULL) {}
+    // releases the tables by requesting size zero
+    ~ForceConst() { set_ntypes(0, NULL, _cop); }
+
+    void set_ntypes(const int ntypes, Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+*/
diff --git a/src/USER-INTEL/verlet_intel.cpp b/src/USER-INTEL/verlet_intel.cpp
new file mode 100644
index 0000000000..64177e0f05
--- /dev/null
+++ b/src/USER-INTEL/verlet_intel.cpp
@@ -0,0 +1,486 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "string.h"
+#include "verlet_intel.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "comm.h"
+#include "atom.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+#include "output.h"
+#include "update.h"
+#include "modify.h"
+#include "compute.h"
+#include "fix.h"
+#include "timer.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+VerletIntel::VerletIntel(LAMMPS *lmp, int narg, char **arg)
+  : Integrate(lmp, narg, arg)
+{
+  // nothing beyond base-class construction; the flags of this class are
+  // assigned in init() and setup()
+}
+
+/* ----------------------------------------------------------------------
+   initialization before run
+------------------------------------------------------------------------- */
+
+void VerletIntel::init()
+{
+  Integrate::init();
+
+  // without any fix nothing will integrate the equations of motion
+
+  if (modify->nfix == 0 && comm->me == 0)
+    error->warning(FLERR,"No fixes defined, atoms won't move");
+
+  // virial_style:
+  // 1 if computed explicitly by pair->compute via sum over pair interactions
+  // 2 if computed implicitly by pair->virial_fdotr_compute via sum over ghosts
+
+  virial_style = force->newton_pair ? 2 : 1;
+
+  // setup lists of computes for global and per-atom PE and pressure
+
+  ev_setup();
+
+  // when the omp fix is present, force_clear() here becomes a no-op
+
+  int ifix = modify->find_fix("package_omp");
+  if (ifix >= 0) external_force_clear = 1;
+
+  if (nvlist_atom) {
+    error->all(FLERR,
+               "Cannot currently get per-atom virials with Intel package.");
+  }
+  #ifdef _LMP_INTEL_OFFLOAD
+  ifix = modify->find_fix("package_intel");
+  if (ifix >= 0) fix_intel = static_cast<FixIntel *>(modify->fix[ifix]);
+  else fix_intel = 0;
+  #endif
+
+  // record which optional per-atom arrays force_clear() has to zero
+
+  torqueflag = atom->torque_flag ? 1 : 0;
+  erforceflag = atom->erforce_flag ? 1 : 0;
+  e_flag = atom->e_flag ? 1 : 0;
+  rho_flag = atom->rho_flag ? 1 : 0;
+
+  // orthogonal vs triclinic simulation box
+
+  triclinic = domain->triclinic;
+}
+
+/* ----------------------------------------------------------------------
+   setup before run
+------------------------------------------------------------------------- */
+
+void VerletIntel::setup()
+{
+  if (comm->me == 0 && screen) fprintf(screen,"Setting up run ...\n");
+
+  // mark setup as in progress; reset to 0 at the end of this function
+
+  update->setupflag = 1;
+
+  // setup domain, communication and neighboring
+  // acquire ghosts
+  // build neighbor lists
+
+  atom->setup();
+  modify->setup_pre_exchange();
+  if (triclinic) domain->x2lamda(atom->nlocal);
+  domain->pbc();
+  domain->reset_box();
+  comm->setup();
+  if (neighbor->style) neighbor->setup_bins();
+  comm->exchange();
+  if (atom->sortfreq > 0) atom->sort();
+  comm->borders();
+  if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+  domain->image_check();
+  domain->box_too_small_check();
+  modify->setup_pre_neighbor();
+  neighbor->build();
+  neighbor->ncalls = 0;
+
+  // compute all forces
+
+  ev_set(update->ntimestep);
+  force_clear();
+  modify->setup_pre_force(vflag);
+
+  // pair first, then bonded terms, then kspace
+
+  if (pair_compute_flag) force->pair->compute(eflag,vflag);
+  else if (force->pair) force->pair->compute_dummy(eflag,vflag);
+
+  if (atom->molecular) {
+    if (force->bond) force->bond->compute(eflag,vflag);
+    if (force->angle) force->angle->compute(eflag,vflag);
+    if (force->dihedral) force->dihedral->compute(eflag,vflag);
+    if (force->improper) force->improper->compute(eflag,vflag);
+  }
+
+  if (force->kspace) {
+    force->kspace->setup();
+    if (kspace_compute_flag) force->kspace->compute(eflag,vflag);
+    else force->kspace->compute_dummy(eflag,vflag);
+  }
+
+  // sync_mode selects when the coprocessor is synchronized:
+  //   0 = never (no Intel fix, or offload balance is 0.0)
+  //   1 = before reverse communication
+  //   2 = after reverse communication (fix reports offload_noghost)
+  // the value chosen here is reused by run()
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  sync_mode = 0;
+  if (fix_intel) {
+    if (fix_intel->offload_balance() != 0.0) {
+      if (fix_intel->offload_noghost())
+	sync_mode = 2;
+      else
+	sync_mode = 1;
+    }
+  }
+  
+  if (sync_mode == 1) fix_intel->sync_coprocessor();
+  #endif
+
+  if (force->newton) comm->reverse_comm();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (sync_mode == 2) fix_intel->sync_coprocessor();
+  #endif
+
+  modify->setup(vflag);
+  output->setup();
+  update->setupflag = 0;
+}
+
+/* ----------------------------------------------------------------------
+   setup without output
+   flag = 0 = just force calculation
+   flag = 1 = reneighbor and force calculation
+------------------------------------------------------------------------- */
+
+void VerletIntel::setup_minimal(int flag)
+{
+  // mark setup as in progress; reset to 0 at the end of this function
+
+  update->setupflag = 1;
+
+  // setup domain, communication and neighboring
+  // acquire ghosts
+  // build neighbor lists
+  // (all of this is skipped when flag == 0)
+
+  if (flag) {
+    modify->setup_pre_exchange();
+    if (triclinic) domain->x2lamda(atom->nlocal);
+    domain->pbc();
+    domain->reset_box();
+    comm->setup();
+    if (neighbor->style) neighbor->setup_bins();
+    comm->exchange();
+    comm->borders();
+    if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+    domain->image_check();
+    domain->box_too_small_check();
+    modify->setup_pre_neighbor();
+    neighbor->build();
+    neighbor->ncalls = 0;
+  }
+
+  // compute all forces
+
+  ev_set(update->ntimestep);
+  force_clear();
+  modify->setup_pre_force(vflag);
+
+  // pair first, then bonded terms, then kspace
+
+  if (pair_compute_flag) force->pair->compute(eflag,vflag);
+  else if (force->pair) force->pair->compute_dummy(eflag,vflag);
+
+  if (atom->molecular) {
+    if (force->bond) force->bond->compute(eflag,vflag);
+    if (force->angle) force->angle->compute(eflag,vflag);
+    if (force->dihedral) force->dihedral->compute(eflag,vflag);
+    if (force->improper) force->improper->compute(eflag,vflag);
+  }
+
+  if (force->kspace) {
+    force->kspace->setup();
+    if (kspace_compute_flag) force->kspace->compute(eflag,vflag);
+    else force->kspace->compute_dummy(eflag,vflag);
+  }
+
+  // sync_mode selects when the coprocessor is synchronized:
+  //   0 = never (no Intel fix, or offload balance is 0.0)
+  //   1 = before reverse communication
+  //   2 = after reverse communication (fix reports offload_noghost)
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  sync_mode = 0;
+  if (fix_intel) {
+    if (fix_intel->offload_balance() != 0.0) {
+      if (fix_intel->offload_noghost())
+	sync_mode = 2;
+      else
+	sync_mode = 1;
+    }
+  }
+  
+  if (sync_mode == 1) fix_intel->sync_coprocessor();
+  #endif
+
+  if (force->newton) comm->reverse_comm();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (sync_mode == 2) fix_intel->sync_coprocessor();
+  #endif
+
+  // unlike setup(), no output->setup() call is made here
+
+  modify->setup(vflag);
+  update->setupflag = 0;
+}
+
+/* ----------------------------------------------------------------------
+   run for N steps
+------------------------------------------------------------------------- */
+
+void VerletIntel::run(int n)
+{
+  bigint ntimestep;
+  int nflag,sortflag;
+
+  // cache the per-stage fix counts once; they are only used below to
+  // skip the corresponding modify-> calls when zero
+
+  int n_post_integrate = modify->n_post_integrate;
+  int n_pre_exchange = modify->n_pre_exchange;
+  int n_pre_neighbor = modify->n_pre_neighbor;
+  int n_pre_force = modify->n_pre_force;
+  int n_post_force = modify->n_post_force;
+  int n_end_of_step = modify->n_end_of_step;
+
+  if (atom->sortfreq > 0) sortflag = 1;
+  else sortflag = 0;
+
+  for (int i = 0; i < n; i++) {
+
+    // advance the global timestep and set eflag/vflag for it
+
+    ntimestep = ++update->ntimestep;
+    ev_set(ntimestep);
+
+    // initial time integration
+
+    modify->initial_integrate(vflag);
+    if (n_post_integrate) modify->post_integrate();
+
+    // regular communication vs neighbor list rebuild
+
+    nflag = neighbor->decide();
+
+    if (nflag == 0) {
+      timer->stamp();
+      comm->forward_comm();
+      timer->stamp(TIME_COMM);
+    } else {
+      if (n_pre_exchange) modify->pre_exchange();
+      if (triclinic) domain->x2lamda(atom->nlocal);
+      domain->pbc();
+      if (domain->box_change) {
+        domain->reset_box();
+        comm->setup();
+        if (neighbor->style) neighbor->setup_bins();
+      }
+      timer->stamp();
+      comm->exchange();
+      if (sortflag && ntimestep >= atom->nextsort) atom->sort();
+      comm->borders();
+      if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+      timer->stamp(TIME_COMM);
+      if (n_pre_neighbor) modify->pre_neighbor();
+      neighbor->build();
+      timer->stamp(TIME_NEIGHBOR);
+    }
+
+    // force computations
+    // important for pair to come before bonded contributions
+    // since some bonded potentials tally pairwise energy/virial
+    // and Pair:ev_tally() needs to be called before any tallying
+
+    force_clear();
+    if (n_pre_force) modify->pre_force(vflag);
+
+    timer->stamp();
+
+    if (pair_compute_flag) {
+      force->pair->compute(eflag,vflag);
+      timer->stamp(TIME_PAIR);
+    }
+
+    if (atom->molecular) {
+      if (force->bond) force->bond->compute(eflag,vflag);
+      if (force->angle) force->angle->compute(eflag,vflag);
+      if (force->dihedral) force->dihedral->compute(eflag,vflag);
+      if (force->improper) force->improper->compute(eflag,vflag);
+      timer->stamp(TIME_BOND);
+    }
+
+    if (kspace_compute_flag) {
+      force->kspace->compute(eflag,vflag);
+      timer->stamp(TIME_KSPACE);
+    }
+
+    // sync_mode was chosen in setup()/setup_minimal():
+    // 1 = synchronize the coprocessor before reverse communication;
+    // the time spent is stamped as pair time
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (sync_mode == 1) {
+      fix_intel->sync_coprocessor();
+      timer->stamp(TIME_PAIR);
+    }
+    #endif
+
+    // reverse communication of forces
+
+    if (force->newton) {
+      comm->reverse_comm();
+      timer->stamp(TIME_COMM);
+    }
+
+    // 2 = synchronize the coprocessor after reverse communication
+
+    #ifdef _LMP_INTEL_OFFLOAD
+    if (sync_mode == 2) {
+      fix_intel->sync_coprocessor();
+      timer->stamp(TIME_PAIR);
+    }
+    #endif
+
+    // force modifications, final time integration, diagnostics
+
+    if (n_post_force) modify->post_force(vflag);
+    modify->final_integrate();
+    if (n_end_of_step) modify->end_of_step();
+
+    // all output
+
+    if (ntimestep == output->next) {
+      timer->stamp();
+      output->write(ntimestep);
+      timer->stamp(TIME_OUTPUT);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Post-run housekeeping: lets the fixes finish via modify->post_run(),
+// re-runs the domain's box-size check and updates the time bookkeeping
+// held by Update.
+void VerletIntel::cleanup()
+{
+  modify->post_run();
+  domain->box_too_small_check();
+  update->update_time();
+}
+
+/* ----------------------------------------------------------------------
+   clear force on own & ghost atoms
+   clear other arrays as needed
+------------------------------------------------------------------------- */
+
+// Zero the force array and, when present, the torque / erforce / de /
+// drho arrays (the flags are recorded in init()).  Does nothing when
+// external_force_clear is set.
+void VerletIntel::force_clear()
+{
+  int i;
+
+  if (external_force_clear) return;
+
+  // clear force on all particles
+  // if either newton flag is set, also include ghosts
+  // when using threads always clear all forces.
+
+  if (neighbor->includegroup == 0) {
+    int nall;
+    if (force->newton) nall = atom->nlocal + atom->nghost;
+    else nall = atom->nlocal;
+
+    // bytes for one double per atom; 3-vector arrays use 3*nbytes
+    size_t nbytes = sizeof(double) * nall;
+
+    if (nbytes) {
+      memset(&(atom->f[0][0]),0,3*nbytes);
+      if (torqueflag)  memset(&(atom->torque[0][0]),0,3*nbytes);
+      if (erforceflag) memset(&(atom->erforce[0]),  0,  nbytes);
+      if (e_flag)      memset(&(atom->de[0]),       0,  nbytes);
+      if (rho_flag)    memset(&(atom->drho[0]),     0,  nbytes);
+    }
+
+  // neighbor includegroup flag is set
+  // clear force only on initial nfirst particles
+  // if either newton flag is set, also include ghosts
+
+  } else {
+    int nall = atom->nfirst;
+
+    double **f = atom->f;
+    for (i = 0; i < nall; i++) {
+      f[i][0] = 0.0;
+      f[i][1] = 0.0;
+      f[i][2] = 0.0;
+    }
+
+    if (torqueflag) {
+      double **torque = atom->torque;
+      for (i = 0; i < nall; i++) {
+        torque[i][0] = 0.0;
+        torque[i][1] = 0.0;
+        torque[i][2] = 0.0;
+      }
+    }
+
+    if (erforceflag) {
+      double *erforce = atom->erforce;
+      for (i = 0; i < nall; i++) erforce[i] = 0.0;
+    }
+
+    if (e_flag) {
+      double *de = atom->de;
+      for (i = 0; i < nall; i++) de[i] = 0.0;
+    }
+
+    if (rho_flag) {
+      double *drho = atom->drho;
+      for (i = 0; i < nall; i++) drho[i] = 0.0;
+    }
+
+    // ghost atoms occupy indices [nlocal, nlocal+nghost); every array is
+    // cleared over exactly that range so that owned atoms with index in
+    // [nfirst, nlocal) are left untouched, as for f above
+
+    if (force->newton) {
+      nall = atom->nlocal + atom->nghost;
+
+      for (i = atom->nlocal; i < nall; i++) {
+        f[i][0] = 0.0;
+        f[i][1] = 0.0;
+        f[i][2] = 0.0;
+      }
+
+      if (torqueflag) {
+        double **torque = atom->torque;
+        for (i = atom->nlocal; i < nall; i++) {
+          torque[i][0] = 0.0;
+          torque[i][1] = 0.0;
+          torque[i][2] = 0.0;
+        }
+      }
+
+      if (erforceflag) {
+        double *erforce = atom->erforce;
+        for (i = atom->nlocal; i < nall; i++) erforce[i] = 0.0;
+      }
+
+      if (e_flag) {
+        double *de = atom->de;
+        for (i = atom->nlocal; i < nall; i++) de[i] = 0.0;
+      }
+
+      if (rho_flag) {
+        double *drho = atom->drho;
+        for (i = atom->nlocal; i < nall; i++) drho[i] = 0.0;
+      }
+    }
+  }
+}
diff --git a/src/USER-INTEL/verlet_intel.h b/src/USER-INTEL/verlet_intel.h
new file mode 100644
index 0000000000..de4231431d
--- /dev/null
+++ b/src/USER-INTEL/verlet_intel.h
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef INTEGRATE_CLASS
+
+IntegrateStyle(verlet/intel,VerletIntel)
+
+#else
+
+#ifndef LMP_VERLET_INTEL_H
+#define LMP_VERLET_INTEL_H
+
+#include "integrate.h"
+#ifdef LMP_INTEL_OFFLOAD
+#include "fix_intel.h"
+#endif
+
+namespace LAMMPS_NS {
+
+// Velocity-Verlet integrator variant for the Intel package; when
+// _LMP_INTEL_OFFLOAD is defined it also tracks the Intel fix and when to
+// synchronize with the coprocessor.
+class VerletIntel : public Integrate {
+ public:
+  VerletIntel(class LAMMPS *, int, char **);
+  virtual ~VerletIntel() {}
+
+  virtual void init();
+  virtual void setup();
+  virtual void setup_minimal(int);
+  virtual void run(int);
+  void cleanup();
+
+ protected:
+  int triclinic;       // 0 if domain is orthog, 1 if triclinic
+
+  // 1 if the matching per-atom array exists and must be cleared
+  int torqueflag;
+  int erforceflag;
+  int e_flag;
+  int rho_flag;
+
+  virtual void force_clear();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  FixIntel *fix_intel; // the "package_intel" fix, or 0 if not defined
+  int sync_mode;       // 0 = no sync, 1 = before, 2 = after reverse comm
+  #endif
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+W: No fixes defined, atoms won't move
+
+If you are not using a fix like nve, nvt, npt then atom velocities and
+coordinates will not be updated during timestepping.
+
+E: Cannot currently get per-atom virials with Intel package.
+
+The Intel package does not yet support per-atom virial calculation.
+
+*/
diff --git a/src/USER-INTEL/verlet_split_intel.cpp b/src/USER-INTEL/verlet_split_intel.cpp
new file mode 100644
index 0000000000..3976607b18
--- /dev/null
+++ b/src/USER-INTEL/verlet_split_intel.cpp
@@ -0,0 +1,589 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Yuxing Peng and Chris Knight (U Chicago)
+------------------------------------------------------------------------- */
+
+#include "string.h"
+#include "verlet_split_intel.h"
+#include "universe.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "comm.h"
+#include "atom.h"
+#include "atom_vec.h"
+#include "force.h"
+#include "pair.h"
+#include "bond.h"
+#include "angle.h"
+#include "dihedral.h"
+#include "improper.h"
+#include "kspace.h"
+#include "output.h"
+#include "update.h"
+#include "fix.h"
+#include "modify.h"
+#include "timer.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+VerletSplitIntel::VerletSplitIntel(LAMMPS *lmp, int narg, char **arg) :
+  VerletIntel(lmp, narg, arg)
+{
+  // build Rspace/Kspace blocks; first, error checks on partitions
+
+  if (universe->nworlds != 2)
+    error->universe_all(FLERR,"Verlet/split requires 2 partitions");
+  if (universe->procs_per_world[0] % universe->procs_per_world[1])
+    error->universe_all(FLERR,"Verlet/split requires Rspace partition "
+                        "size be multiple of Kspace partition size");
+
+  // master = 1 for Rspace procs, 0 for Kspace procs
+
+  if (universe->iworld == 0) master = 1;
+  else master = 0;
+
+  ratio = universe->procs_per_world[0] / universe->procs_per_world[1];
+
+  // Kspace root proc broadcasts info about Kspace proc layout to Rspace procs
+
+  int kspace_procgrid[3];
+
+  if (universe->me == universe->root_proc[1]) {
+    kspace_procgrid[0] = comm->procgrid[0];
+    kspace_procgrid[1] = comm->procgrid[1];
+    kspace_procgrid[2] = comm->procgrid[2];
+  }
+  MPI_Bcast(kspace_procgrid,3,MPI_INT,universe->root_proc[1],universe->uworld);
+
+  int ***kspace_grid2proc;    // copy of Kspace grid2proc, filled by Bcast below
+  memory->create(kspace_grid2proc,kspace_procgrid[0],
+                 kspace_procgrid[1],kspace_procgrid[2],
+                 "verlet/split:kspace_grid2proc");
+
+  if (universe->me == universe->root_proc[1]) {
+    for (int i = 0; i < comm->procgrid[0]; i++)
+      for (int j = 0; j < comm->procgrid[1]; j++)
+        for (int k = 0; k < comm->procgrid[2]; k++)
+          kspace_grid2proc[i][j][k] = comm->grid2proc[i][j][k];
+  }
+  MPI_Bcast(&kspace_grid2proc[0][0][0],
+            kspace_procgrid[0]*kspace_procgrid[1]*kspace_procgrid[2],MPI_INT,
+            universe->root_proc[1],universe->uworld);
+
+  // Rspace partition must be multiple of Kspace partition in each dim
+  // so atoms of one Kspace proc coincide with atoms of several Rspace procs
+
+  if (master) {
+    int flag = 0;
+    if (comm->procgrid[0] % kspace_procgrid[0]) flag = 1;
+    if (comm->procgrid[1] % kspace_procgrid[1]) flag = 1;
+    if (comm->procgrid[2] % kspace_procgrid[2]) flag = 1;
+    if (flag)
+      error->one(FLERR,
+                 "Verlet/split requires Rspace partition layout be "
+                 "multiple of Kspace partition layout in each dim");
+  }
+
+  // block = 1 Kspace proc with set of Rspace procs it overlays
+  // me_block = 0 for Kspace proc
+  // me_block = 1 to ratio for Rspace procs
+  // block = MPI communicator for that set of procs
+
+  int iblock,key;             // color and rank key passed to MPI_Comm_split
+
+  if (!master) {
+    iblock = comm->me;
+    key = 0;
+  } else {
+    int kpx = comm->myloc[0] / (comm->procgrid[0]/kspace_procgrid[0]);
+    int kpy = comm->myloc[1] / (comm->procgrid[1]/kspace_procgrid[1]);
+    int kpz = comm->myloc[2] / (comm->procgrid[2]/kspace_procgrid[2]);
+    iblock = kspace_grid2proc[kpx][kpy][kpz];
+    key = 1;
+  }
+
+  MPI_Comm_split(universe->uworld,iblock,key,&block);
+  MPI_Comm_rank(block,&me_block);
+
+  // output block groupings to universe screen/logfile
+  // bmap is ordered by block and then by proc within block
+
+  int *bmap = new int[universe->nprocs];
+  for (int i = 0; i < universe->nprocs; i++) bmap[i] = -1;
+  bmap[iblock*(ratio+1)+me_block] = universe->me;
+
+  int *bmapall = new int[universe->nprocs];
+  MPI_Allreduce(bmap,bmapall,universe->nprocs,MPI_INT,MPI_MAX,universe->uworld);
+
+  if (universe->me == 0) {
+    if (universe->uscreen) {
+      fprintf(universe->uscreen,
+              "Per-block Rspace/Kspace proc IDs (original proc IDs):\n");
+      int m = 0;
+      for (int i = 0; i < universe->nprocs/(ratio+1); i++) {
+        fprintf(universe->uscreen,"  block %d:",i);
+        int kspace_proc = bmapall[m];
+        for (int j = 1; j <= ratio; j++)
+          fprintf(universe->uscreen," %d",bmapall[m+j]);
+        fprintf(universe->uscreen," %d",kspace_proc);
+        kspace_proc = bmapall[m];    // same value as assigned above
+        for (int j = 1; j <= ratio; j++) {
+          if (j == 1) fprintf(universe->uscreen," (");
+          else fprintf(universe->uscreen," ");
+          fprintf(universe->uscreen,"%d",
+                  universe->uni2orig[bmapall[m+j]]);
+        }
+        fprintf(universe->uscreen," %d)\n",universe->uni2orig[kspace_proc]);
+        m += ratio + 1;
+      }
+    }
+    if (universe->ulogfile) {
+      fprintf(universe->ulogfile,
+              "Per-block Rspace/Kspace proc IDs (original proc IDs):\n");
+      int m = 0;
+      for (int i = 0; i < universe->nprocs/(ratio+1); i++) {
+        fprintf(universe->ulogfile,"  block %d:",i);
+        int kspace_proc = bmapall[m];
+        for (int j = 1; j <= ratio; j++)
+          fprintf(universe->ulogfile," %d",bmapall[m+j]);
+
+        fprintf(universe->ulogfile," %d",kspace_proc);
+        kspace_proc = bmapall[m];    // same value as assigned above
+        for (int j = 1; j <= ratio; j++) {
+          if (j == 1) fprintf(universe->ulogfile," (");
+          else fprintf(universe->ulogfile," ");
+          fprintf(universe->ulogfile,"%d",
+                  universe->uni2orig[bmapall[m+j]]);
+        }
+        fprintf(universe->ulogfile," %d)\n",universe->uni2orig[kspace_proc]);
+        m += ratio + 1;
+      }
+    }
+  }
+
+  memory->destroy(kspace_grid2proc);
+  delete [] bmap;
+  delete [] bmapall;
+
+  // size/disp = vectors for MPI gather/scatter within block (ratio+1 procs)
+
+  qsize = new int[ratio+1];
+  qdisp = new int[ratio+1];
+  xsize = new int[ratio+1];
+  xdisp = new int[ratio+1];
+
+  // f_kspace = Rspace copy of Kspace forces
+  // allocate dummy version for Kspace partition
+
+  maxatom = 0;
+  f_kspace = NULL;
+  if (!master) memory->create(f_kspace,1,1,"verlet/split:f_kspace");
+}
+
+/* ---------------------------------------------------------------------- */
+
+VerletSplitIntel::~VerletSplitIntel()
+{
+  // free block comm vectors, Kspace force copy, and block communicator
+
+  delete [] qsize; delete [] qdisp;
+  delete [] xsize; delete [] xdisp;
+  memory->destroy(f_kspace);
+  MPI_Comm_free(&block);
+}
+
+/* ----------------------------------------------------------------------
+   initialization before run
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::init()
+{
+  const bool no_kspace = (force->kspace == NULL);
+  if (no_kspace && comm->me == 0)
+    error->warning(FLERR,"No Kspace calculation with verlet/split");
+
+  // record whether force->kspace_match() finds a "tip4p" KSpace style
+
+  tip4p_flag = force->kspace_match("tip4p",0) ? 1 : 0;
+
+  // TIP4P does not currently work with verlet/split, so generate error
+
+  if (tip4p_flag) error->all(FLERR,"Verlet/split does not yet support TIP4P");
+  VerletIntel::init();
+}
+
+/* ----------------------------------------------------------------------
+   setup before run
+   servant partition only sets up KSpace calculation
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::setup()
+{
+  const bool announce = (comm->me == 0) && screen;
+  if (announce) fprintf(screen,"Setting up run ...\n");
+
+  // Rspace procs run the full setup, Kspace procs only set up KSpace
+  if (master) VerletIntel::setup();
+  else force->kspace->setup();
+}
+
+/* ----------------------------------------------------------------------
+   setup without output
+   flag = 0 = just force calculation
+   flag = 1 = reneighbor and force calculation
+   servant partition only sets up KSpace calculation
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::setup_minimal(int flag)
+{
+  // Rspace procs do the regular minimal setup, Kspace procs only KSpace
+
+  if (master) VerletIntel::setup_minimal(flag);
+  else force->kspace->setup();
+}
+
+/* ----------------------------------------------------------------------
+   run for N steps
+   master partition does everything but Kspace
+   servant partition does just Kspace
+   communicate back and forth every step:
+     atom coords from master -> servant
+     kspace forces from servant -> master
+     also box bounds from master -> servant if necessary
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::run(int n)
+{
+  bigint ntimestep;
+  int nflag,sortflag;
+
+  // sync both partitions before starting the timer
+
+  MPI_Barrier(universe->uworld);
+  timer->init();
+  timer->barrier_start(TIME_LOOP);
+
+  // setup initial Rspace <-> Kspace comm params
+
+  rk_setup();
+
+  // check if OpenMP support fix defined
+
+  Fix *fix_omp;
+  int ifix = modify->find_fix("package_omp");
+  if (ifix < 0) fix_omp = NULL;
+  else fix_omp = modify->fix[ifix];
+
+  // flags for timestepping iterations
+
+  int n_post_integrate = modify->n_post_integrate;
+  int n_pre_exchange = modify->n_pre_exchange;
+  int n_pre_neighbor = modify->n_pre_neighbor;
+  int n_pre_force = modify->n_pre_force;
+  int n_post_force = modify->n_post_force;
+  int n_end_of_step = modify->n_end_of_step;
+
+  if (atom->sortfreq > 0) sortflag = 1;
+  else sortflag = 0;
+
+  for (int i = 0; i < n; i++) {
+
+    ntimestep = ++update->ntimestep;
+    ev_set(ntimestep);
+
+    // initial time integration
+
+    if (master) {
+      modify->initial_integrate(vflag);
+      if (n_post_integrate) modify->post_integrate();
+    }
+
+    // regular communication vs neighbor list rebuild
+    // only Rspace procs decide; block rank 1 (an Rspace proc) tells the rest
+
+    if (master) nflag = neighbor->decide();
+    MPI_Bcast(&nflag,1,MPI_INT,1,block);
+
+    if (master) {
+      if (nflag == 0) {
+        timer->stamp();
+        comm->forward_comm();
+        timer->stamp(TIME_COMM);
+      } else {
+        if (n_pre_exchange) modify->pre_exchange();
+        if (triclinic) domain->x2lamda(atom->nlocal);
+        domain->pbc();
+        if (domain->box_change) {
+          domain->reset_box();
+          comm->setup();
+          if (neighbor->style) neighbor->setup_bins();
+        }
+        timer->stamp();
+        comm->exchange();
+        if (sortflag && ntimestep >= atom->nextsort) atom->sort();
+        comm->borders();
+        if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+        timer->stamp(TIME_COMM);
+        if (n_pre_neighbor) modify->pre_neighbor();
+        neighbor->build();
+        timer->stamp(TIME_NEIGHBOR);
+      }
+    }
+
+    // if reneighboring occurred, re-setup Rspace <-> Kspace comm params
+    // comm Rspace atom coords to Kspace procs
+
+    if (nflag) rk_setup();
+    r2k_comm();
+
+    // force computations
+
+    force_clear();
+
+    if (master) {
+      if (n_pre_force) modify->pre_force(vflag);
+
+      timer->stamp();
+      if (force->pair) {
+        force->pair->compute(eflag,vflag);
+        timer->stamp(TIME_PAIR);
+      }
+
+      if (atom->molecular) {
+        if (force->bond) force->bond->compute(eflag,vflag);
+        if (force->angle) force->angle->compute(eflag,vflag);
+        if (force->dihedral) force->dihedral->compute(eflag,vflag);
+        if (force->improper) force->improper->compute(eflag,vflag);
+        timer->stamp(TIME_BOND);
+      }
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (sync_mode == 1) {
+	fix_intel->sync_coprocessor();
+	timer->stamp(TIME_PAIR);
+      }
+      #endif
+
+      if (force->newton) {
+        comm->reverse_comm();
+        timer->stamp(TIME_COMM);
+      }
+
+      #ifdef _LMP_INTEL_OFFLOAD
+      if (sync_mode == 2) {
+	fix_intel->sync_coprocessor();
+	timer->stamp(TIME_PAIR);
+      }
+      #endif
+
+    } else {
+
+      // run FixOMP as sole pre_force fix, if defined
+
+      if (fix_omp) fix_omp->pre_force(vflag);
+
+      if (force->kspace) {
+        timer->stamp();
+        force->kspace->compute(eflag,vflag);
+        timer->stamp(TIME_KSPACE);
+      }
+
+      // TIP4P PPPM puts forces on ghost atoms, so must reverse_comm()
+
+      if (tip4p_flag && force->newton) {
+        comm->reverse_comm();
+        timer->stamp(TIME_COMM);
+      }
+    }
+
+    // comm and sum Kspace forces back to Rspace procs
+
+    k2r_comm();
+
+    // force modifications, final time integration, diagnostics
+    // all output is done by the Rspace (master) partition only
+
+    if (master) {
+      if (n_post_force) modify->post_force(vflag);
+      modify->final_integrate();
+      if (n_end_of_step) modify->end_of_step();
+
+      if (ntimestep == output->next) {
+        timer->stamp();
+        output->write(ntimestep);
+        timer->stamp(TIME_OUTPUT);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   setup params for Rspace <-> Kspace communication
+   called initially and after every reneighbor
+   also communicate atom charges from Rspace to KSpace since static
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::rk_setup()
+{
+  // grow f_kspace array on master procs whenever atom->nmax has grown
+
+  if (master) {
+    if (atom->nmax > maxatom) {    // nmax, not nlocal: also allocates for 0 atoms
+      memory->destroy(f_kspace);
+      maxatom = atom->nmax;
+      memory->create(f_kspace,maxatom,3,"verlet/split:f_kspace");
+    }
+  }
+
+  // qsize = # of atoms owned by each master proc in block
+
+  int n = 0;
+  if (master) n = atom->nlocal;
+  MPI_Gather(&n,1,MPI_INT,qsize,1,MPI_INT,0,block);
+
+  // setup qdisp, xsize, xdisp based on qsize
+  // only needed by Kspace proc
+  // set Kspace nlocal to sum of Rspace nlocals
+  // ensure Kspace atom arrays are large enough
+
+  if (!master) {
+    qsize[0] = qdisp[0] = xsize[0] = xdisp[0] = 0;
+    for (int i = 1; i <= ratio; i++) {
+      qdisp[i] = qdisp[i-1]+qsize[i-1];
+      xsize[i] = 3*qsize[i];
+      xdisp[i] = xdisp[i-1]+xsize[i-1];
+    }
+
+    atom->nlocal = qdisp[ratio] + qsize[ratio];
+    while (atom->nmax <= atom->nlocal) atom->avec->grow(0);
+    atom->nghost = 0;
+  }
+
+  // one-time gather of Rspace atom charges to Kspace proc
+
+  MPI_Gatherv(atom->q,n,MPI_DOUBLE,atom->q,qsize,qdisp,MPI_DOUBLE,0,block);
+
+  // for TIP4P also need to send atom type and tag
+  // KSpace procs need to acquire ghost atoms and map all their atoms
+  // map_clear() call is in lieu of comm->exchange() which performs map_clear
+  // borders() call acquires ghost atoms and maps them
+  // NOTE: do atom coords need to be communicated here before borders() call?
+  //   could do this by calling r2k_comm() here and not again from run()
+  //   except that forward_comm() in r2k_comm() is wrong
+
+  if (tip4p_flag) {
+    //r2k_comm();
+    MPI_Gatherv(atom->type,n,MPI_INT,atom->type,qsize,qdisp,MPI_INT,0,block);
+    MPI_Gatherv(atom->tag,n,MPI_LMP_TAGINT,
+                atom->tag,qsize,qdisp,MPI_LMP_TAGINT,0,block);
+    if (!master) {
+      if (triclinic) domain->x2lamda(atom->nlocal);
+      if (domain->box_change) comm->setup();
+      timer->stamp();
+      atom->map_clear();
+      comm->borders();
+      if (triclinic) domain->lamda2x(atom->nlocal+atom->nghost);
+      timer->stamp(TIME_COMM);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   communicate Rspace atom coords to Kspace
+   also eflag,vflag and box bounds if needed
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::r2k_comm()
+{
+  MPI_Status status;
+
+  int n = 0;                       // Kspace proc contributes no coords
+  if (master) n = atom->nlocal;
+  MPI_Gatherv(atom->x[0],n*3,MPI_DOUBLE,atom->x[0],xsize,xdisp,
+              MPI_DOUBLE,0,block);
+
+  // send eflag,vflag from Rspace (block rank 1) to Kspace (block rank 0)
+
+  if (me_block == 1) {
+    int flags[2];
+    flags[0] = eflag; flags[1] = vflag;
+    MPI_Send(flags,2,MPI_INT,0,0,block);
+  } else if (!master) {
+    int flags[2];
+    MPI_Recv(flags,2,MPI_INT,1,0,block,&status);    // type must match the send
+    eflag = flags[0]; vflag = flags[1];
+  }
+
+  // send box bounds from Rspace to Kspace if simulation box is dynamic
+
+  if (domain->box_change) {
+    if (me_block == 1) {
+      MPI_Send(domain->boxlo,3,MPI_DOUBLE,0,0,block);
+      MPI_Send(domain->boxhi,3,MPI_DOUBLE,0,0,block);
+    } else if (!master) {
+      MPI_Recv(domain->boxlo,3,MPI_DOUBLE,1,0,block,&status);
+      MPI_Recv(domain->boxhi,3,MPI_DOUBLE,1,0,block,&status);
+      domain->set_global_box();
+      domain->set_local_box();
+      force->kspace->setup();
+    }
+  }
+
+  // for TIP4P, Kspace partition needs to update its ghost atoms
+
+  if (tip4p_flag && !master) {
+    timer->stamp();
+    comm->forward_comm();
+    timer->stamp(TIME_COMM);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   communicate and sum Kspace atom forces back to Rspace
+------------------------------------------------------------------------- */
+
+void VerletSplitIntel::k2r_comm()
+{
+  // Kspace energy and virial are broadcast from block rank 0 when flagged
+
+  if (eflag) MPI_Bcast(&force->kspace->energy,1,MPI_DOUBLE,0,block);
+  if (vflag) MPI_Bcast(force->kspace->virial,6,MPI_DOUBLE,0,block);
+
+  // scatter forces from block rank 0; Rspace procs receive 3 per local atom
+
+  const int nrecv = master ? 3*atom->nlocal : 0;
+  MPI_Scatterv(atom->f[0],xsize,xdisp,MPI_DOUBLE,
+               f_kspace[0],nrecv,MPI_DOUBLE,0,block);
+  if (!master) return;
+
+  // Rspace procs add the received Kspace forces to their own forces
+
+  double **f = atom->f;
+  for (int i = 0; i < atom->nlocal; i++)
+    for (int k = 0; k < 3; k++) f[i][k] += f_kspace[i][k];
+}
+
+/* ----------------------------------------------------------------------
+   memory usage of Kspace force array on master procs
+------------------------------------------------------------------------- */
+
+bigint VerletSplitIntel::memory_usage()
+{
+  // f_kspace is sized maxatom x 3 doubles on master procs
+  return maxatom*3 * sizeof(double);
+}
diff --git a/src/USER-INTEL/verlet_split_intel.h b/src/USER-INTEL/verlet_split_intel.h
new file mode 100644
index 0000000000..3f81d41a97
--- /dev/null
+++ b/src/USER-INTEL/verlet_split_intel.h
@@ -0,0 +1,89 @@
+/* -------------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef INTEGRATE_CLASS
+
+IntegrateStyle(verlet/split/intel,VerletSplitIntel)
+
+#else
+
+#ifndef LMP_VERLET_SPLIT_INTEL_H
+#define LMP_VERLET_SPLIT_INTEL_H
+
+#include "verlet_intel.h"
+#ifdef LMP_INTEL_OFFLOAD
+#include "fix_intel.h"
+#endif
+
+namespace LAMMPS_NS {
+
+class VerletSplitIntel : public VerletIntel {    // style "verlet/split/intel"
+ public:
+  VerletSplitIntel(class LAMMPS *, int, char **);
+  ~VerletSplitIntel();
+  void init();
+  void setup();
+  void setup_minimal(int);
+  void run(int);
+  bigint memory_usage();
+
+ private:
+  int master;                        // 1 if an Rspace proc, 0 if Kspace
+  int me_block;                      // proc ID within Rspace/Kspace block
+  int ratio;                         // ratio of Rspace procs to Kspace procs
+  int *qsize,*qdisp,*xsize,*xdisp;   // MPI gather/scatter params for block comm
+  MPI_Comm block;                    // communicator within one block
+  int tip4p_flag;                    // 1 if PPPM/tip4p so do extra comm
+
+  double **f_kspace;                 // copy of Kspace forces on Rspace procs
+  int maxatom;                       // # of atoms f_kspace is allocated for
+
+  void rk_setup();
+  void r2k_comm();
+  void k2r_comm();
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Verlet/split requires 2 partitions
+
+See the -partition command-line switch.
+
+E: Verlet/split requires Rspace partition size be multiple of Kspace partition size
+
+This is so there is an equal number of Rspace processors for every
+Kspace processor.
+
+E: Verlet/split requires Rspace partition layout be multiple of Kspace partition layout in each dim
+
+This is controlled by the processors command.
+
+W: No Kspace calculation with verlet/split
+
+The 2nd partition performs a kspace calculation so the kspace_style
+command must be used.
+
+E: Verlet/split does not yet support TIP4P
+
+This is a current limitation.
+
+E: Cannot currently get per-atom virials with Intel package.
+
+The Intel package does not yet support per-atom virial calculation.
+
+*/
diff --git a/src/USER-OMP/pair_gran_hooke_history_omp.cpp b/src/USER-OMP/pair_gran_hooke_history_omp.cpp
index 57827adf28..afe287c601 100644
--- a/src/USER-OMP/pair_gran_hooke_history_omp.cpp
+++ b/src/USER-OMP/pair_gran_hooke_history_omp.cpp
@@ -35,9 +35,6 @@ PairGranHookeHistoryOMP::PairGranHookeHistoryOMP(LAMMPS *lmp) :
 {
   suffix_flag |= Suffix::OMP;
   respa_enable = 0;
-  // trigger use of OpenMP version of FixShearHistory
-  suffix = new char[4];
-  memcpy(suffix,"omp",4);
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/angle_hybrid.cpp b/src/angle_hybrid.cpp
index 6f1cceba00..1780c4344e 100644
--- a/src/angle_hybrid.cpp
+++ b/src/angle_hybrid.cpp
@@ -208,7 +208,7 @@ void AngleHybrid::settings(int narg, char **arg)
   // one exception is 1st arg of style "table", which is non-numeric
   // need a better way to skip these exceptions
 
-  int dummy;
+  int sflag;
   nstyles = 0;
   i = 0;
 
@@ -221,9 +221,10 @@ void AngleHybrid::settings(int narg, char **arg)
       error->all(FLERR,"Angle style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[i],"none") == 0)
       error->all(FLERR,"Angle style hybrid cannot have none as an argument");
-    styles[nstyles] = force->new_angle(arg[i],lmp->suffix,dummy);
-    keywords[nstyles] = new char[strlen(arg[i])+1];
-    strcpy(keywords[nstyles],arg[i]);
+
+    styles[nstyles] = force->new_angle(arg[i],1,sflag);
+    force->store_style(keywords[nstyles],arg[i],sflag);
+
     istyle = i;
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
@@ -346,7 +347,7 @@ void AngleHybrid::read_restart(FILE *fp)
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
-    styles[m] = force->new_angle(keywords[m],lmp->suffix,dummy);
+    styles[m] = force->new_angle(keywords[m],0,dummy);
   }
 }
 
diff --git a/src/atom.cpp b/src/atom.cpp
index 7efbf4740f..550b959f22 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -333,7 +333,7 @@ void Atom::settings(Atom *old)
    called from lammps.cpp, input script, restart file, replicate
 ------------------------------------------------------------------------- */
 
-void Atom::create_avec(const char *style, int narg, char **arg, char *suffix)
+void Atom::create_avec(const char *style, int narg, char **arg, int trysuffix)
 {
   delete [] atom_style;
   if (avec) delete avec;
@@ -362,14 +362,15 @@ void Atom::create_avec(const char *style, int narg, char **arg, char *suffix)
   //   so that x[0][0] can always be referenced even if proc has no atoms
 
   int sflag;
-  avec = new_avec(style,suffix,sflag);
+  avec = new_avec(style,trysuffix,sflag);
   avec->store_args(narg,arg);
   avec->process_args(narg,arg);
   avec->grow(1);
 
   if (sflag) {
     char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
+    if (sflag == 1) sprintf(estyle,"%s/%s",style,lmp->suffix);
+    else sprintf(estyle,"%s/%s",style,lmp->suffix2);    // sflag 2 = suffix2
     int n = strlen(estyle) + 1;
     atom_style = new char[n];
     strcpy(atom_style,estyle);
@@ -394,26 +395,41 @@ void Atom::create_avec(const char *style, int narg, char **arg, char *suffix)
    generate an AtomVec class, first with suffix appended
 ------------------------------------------------------------------------- */
 
-AtomVec *Atom::new_avec(const char *style, char *suffix, int &sflag)
+AtomVec *Atom::new_avec(const char *style, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
 
-    if (0) return NULL;
+      if (0) return NULL;
 
 #define ATOM_CLASS
 #define AtomStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
 #include "style_atom.h"
 #undef AtomStyle
 #undef ATOM_CLASS
+    }
 
+    if (lmp->suffix2) {
+      sflag = 2;                // 2 = style found with suffix2, 1 = with suffix
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+
+      if (0) return NULL;
+
+#define ATOM_CLASS
+#define AtomStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+#include "style_atom.h"
+#undef AtomStyle
+#undef ATOM_CLASS
+    }
   }
 
   sflag = 0;
-
   if (0) return NULL;
 
 #define ATOM_CLASS
@@ -423,7 +439,6 @@ AtomVec *Atom::new_avec(const char *style, char *suffix, int &sflag)
 #undef ATOM_CLASS
 
   else error->all(FLERR,"Invalid atom style");
-
   return NULL;
 }
 
diff --git a/src/atom.h b/src/atom.h
index c6bebe88a9..2f21fee722 100644
--- a/src/atom.h
+++ b/src/atom.h
@@ -171,8 +171,8 @@ class Atom : protected Pointers {
   ~Atom();
 
   void settings(class Atom *);
-  void create_avec(const char *, int, char **, char *suffix = NULL);
-  class AtomVec *new_avec(const char *, char *, int &);
+  void create_avec(const char *, int, char **, int);
+  class AtomVec *new_avec(const char *, int, int &);
   void init();
   void setup();
 
diff --git a/src/bond_hybrid.cpp b/src/bond_hybrid.cpp
index 4545008611..63357a12e5 100644
--- a/src/bond_hybrid.cpp
+++ b/src/bond_hybrid.cpp
@@ -207,7 +207,7 @@ void BondHybrid::settings(int narg, char **arg)
   // one exception is 1st arg of style "table", which is non-numeric
   // need a better way to skip these exceptions
 
-  int dummy;
+  int sflag;
   nstyles = 0;
   i = 0;
 
@@ -219,9 +219,10 @@ void BondHybrid::settings(int narg, char **arg)
       error->all(FLERR,"Bond style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[i],"none") == 0)
       error->all(FLERR,"Bond style hybrid cannot have none as an argument");
-    styles[nstyles] = force->new_bond(arg[i],lmp->suffix,dummy);
-    keywords[nstyles] = new char[strlen(arg[i])+1];
-    strcpy(keywords[nstyles],arg[i]);
+
+    styles[nstyles] = force->new_bond(arg[i],1,sflag);
+    force->store_style(keywords[nstyles],arg[i],sflag);
+
     istyle = i;
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
@@ -330,7 +331,7 @@ void BondHybrid::read_restart(FILE *fp)
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
-    styles[m] = force->new_bond(keywords[m],lmp->suffix,dummy);
+    styles[m] = force->new_bond(keywords[m],0,dummy);
   }
 }
 
diff --git a/src/delete_bonds.cpp b/src/delete_bonds.cpp
index 3b2e9a528b..b380508ee9 100644
--- a/src/delete_bonds.cpp
+++ b/src/delete_bonds.cpp
@@ -167,7 +167,7 @@ void DeleteBonds::command(int narg, char **arg)
           else if (style == ATOM) {
             if (tlist[type[i]] || tlist[type[atom1]]) flag = 1;
           } else if (style == BOND) {
-            itype = static_cast<int> (fabs(bond_type[i][m]));
+            itype = abs(bond_type[i][m]);
             if (tlist[itype]) flag = 1;
           }
           if (flag) {
@@ -205,7 +205,7 @@ void DeleteBonds::command(int narg, char **arg)
             if (tlist[type[atom1]] || tlist[type[atom2]] ||
                 tlist[type[atom3]]) flag = 1;
           } else if (style == ANGLE) {
-            itype = static_cast<int> (fabs(angle_type[i][m]));
+            itype = abs(angle_type[i][m]);
             if (tlist[itype]) flag = 1;
           }
           if (flag) {
@@ -245,7 +245,7 @@ void DeleteBonds::command(int narg, char **arg)
               if (tlist[type[atom1]] || tlist[type[atom2]] ||
                   tlist[type[atom3]] || tlist[type[atom4]]) flag = 1;
           } else if (style == DIHEDRAL) {
-            itype = static_cast<int> (fabs(dihedral_type[i][m]));
+            itype = abs(dihedral_type[i][m]);
             if (tlist[itype]) flag = 1;
           }
           if (flag) {
@@ -285,7 +285,7 @@ void DeleteBonds::command(int narg, char **arg)
               if (tlist[type[atom1]] || tlist[type[atom2]] ||
                   tlist[type[atom3]] || tlist[type[atom4]]) flag = 1;
           } else if (style == IMPROPER) {
-            itype = static_cast<int> (fabs(improper_type[i][m]));
+            itype = abs(improper_type[i][m]);
             if (tlist[itype]) flag = 1;
           }
           if (flag) {
diff --git a/src/dihedral_hybrid.cpp b/src/dihedral_hybrid.cpp
index 7b0dea64db..6f8ef5a093 100644
--- a/src/dihedral_hybrid.cpp
+++ b/src/dihedral_hybrid.cpp
@@ -209,7 +209,7 @@ void DihedralHybrid::settings(int narg, char **arg)
   // one exception is 1st arg of style "table", which is non-numeric
   // need a better way to skip these exceptions
 
-  int dummy;
+  int sflag;
   nstyles = 0;
   i = 0;
 
@@ -223,9 +223,10 @@ void DihedralHybrid::settings(int narg, char **arg)
                  "Dihedral style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[i],"none") == 0)
       error->all(FLERR,"Dihedral style hybrid cannot have none as an argument");
-    styles[nstyles] = force->new_dihedral(arg[i],lmp->suffix,dummy);
-    keywords[nstyles] = new char[strlen(arg[i])+1];
-    strcpy(keywords[nstyles],arg[i]);
+
+    styles[nstyles] = force->new_dihedral(arg[i],1,sflag);
+    force->store_style(keywords[nstyles],arg[i],sflag);
+
     istyle = i;
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
@@ -331,7 +332,7 @@ void DihedralHybrid::read_restart(FILE *fp)
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
-    styles[m] = force->new_dihedral(keywords[m],lmp->suffix,dummy);
+    styles[m] = force->new_dihedral(keywords[m],0,dummy);
   }
 }
 
diff --git a/src/force.cpp b/src/force.cpp
index c316c04a52..832133548a 100644
--- a/src/force.cpp
+++ b/src/force.cpp
@@ -125,47 +125,46 @@ void Force::init()
    create a pair style, called from input script or restart file
 ------------------------------------------------------------------------- */
 
-void Force::create_pair(const char *style, const char *suffix)
+void Force::create_pair(const char *style, int trysuffix)
 {
   delete [] pair_style;
   if (pair) delete pair;
 
   int sflag;
-  pair = new_pair(style,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    int n = strlen(estyle) + 1;
-    pair_style = new char[n];
-    strcpy(pair_style,estyle);
-  } else {
-    int n = strlen(style) + 1;
-    pair_style = new char[n];
-    strcpy(pair_style,style);
-  }
+  pair = new_pair(style,trysuffix,sflag);
+  store_style(pair_style,style,sflag);
 }
 
 /* ----------------------------------------------------------------------
    generate a pair class
-   try first with suffix appended
+   if trysuffix = 1, try first with suffix1/2 appended
+   return sflag = 0 for no suffix added, 1 or 2 for suffix1/2 added
 ------------------------------------------------------------------------- */
 
-Pair *Force::new_pair(const char *style, const char *suffix, int &sflag)
+Pair *Force::new_pair(const char *style, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-
-    if (pair_map->find(estyle) != pair_map->end()) {
-      PairCreator pair_creator = (*pair_map)[estyle];
-      return pair_creator(lmp);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
+      if (pair_map->find(estyle) != pair_map->end()) {
+        PairCreator pair_creator = (*pair_map)[estyle];
+        return pair_creator(lmp);
+      }
+    }
+    if (lmp->suffix2) {
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+      if (pair_map->find(estyle) != pair_map->end()) {
+        PairCreator pair_creator = (*pair_map)[estyle];
+        return pair_creator(lmp);
+      }
     }
   }
 
   sflag = 0;
-
   if (strcmp(style,"none") == 0) return NULL;
   if (pair_map->find(style) != pair_map->end()) {
     PairCreator pair_creator = (*pair_map)[style];
@@ -230,50 +229,55 @@ Pair *Force::pair_match(const char *word, int exact)
    create a bond style, called from input script or restart file
 ------------------------------------------------------------------------- */
 
-void Force::create_bond(const char *style, const char *suffix)
+void Force::create_bond(const char *style, int trysuffix)
 {
   delete [] bond_style;
   if (bond) delete bond;
 
   int sflag;
-  bond = new_bond(style,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    int n = strlen(estyle) + 1;
-    bond_style = new char[n];
-    strcpy(bond_style,estyle);
-  } else {
-    int n = strlen(style) + 1;
-    bond_style = new char[n];
-    strcpy(bond_style,style);
-  }
+  bond = new_bond(style,trysuffix,sflag);
+  store_style(bond_style,style,sflag);
 }
 
 /* ----------------------------------------------------------------------
    generate a bond class, fist with suffix appended
 ------------------------------------------------------------------------- */
 
-Bond *Force::new_bond(const char *style, const char *suffix, int &sflag)
+Bond *Force::new_bond(const char *style, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-
-    if (0) return NULL;
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
+      
+      if (0) return NULL;
 
 #define BOND_CLASS
 #define BondStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
 #include "style_bond.h"
 #undef BondStyle
 #undef BOND_CLASS
+    }
+
+    if (lmp->suffix2) {
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+      
+      if (0) return NULL;
+
+#define BOND_CLASS
+#define BondStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+#include "style_bond.h"
+#undef BondStyle
+#undef BOND_CLASS
+    }
   }
 
   sflag = 0;
-
   if (strcmp(style,"none") == 0) return NULL;
 
 #define BOND_CLASS
@@ -305,51 +309,55 @@ Bond *Force::bond_match(const char *style)
    create an angle style, called from input script or restart file
 ------------------------------------------------------------------------- */
 
-void Force::create_angle(const char *style, const char *suffix)
+void Force::create_angle(const char *style, int trysuffix)
 {
   delete [] angle_style;
   if (angle) delete angle;
 
   int sflag;
-  angle = new_angle(style,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    int n = strlen(estyle) + 1;
-    angle_style = new char[n];
-    strcpy(angle_style,estyle);
-  } else {
-    int n = strlen(style) + 1;
-    angle_style = new char[n];
-    strcpy(angle_style,style);
-  }
+  angle = new_angle(style,trysuffix,sflag);
+  store_style(angle_style,style,sflag);
 }
 
 /* ----------------------------------------------------------------------
    generate an angle class
 ------------------------------------------------------------------------- */
 
-Angle *Force::new_angle(const char *style, const char *suffix, int &sflag)
+Angle *Force::new_angle(const char *style, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-
-    if (0) return NULL;
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
+      
+      if (0) return NULL;
 
 #define ANGLE_CLASS
 #define AngleStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
 #include "style_angle.h"
 #undef AngleStyle
 #undef ANGLE_CLASS
+    }
 
+    // 2nd try: style name with suffix2 appended; sflag = 2 on a match
+    if (lmp->suffix2) {
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+      if (0) return NULL;
+
+#define ANGLE_CLASS
+#define AngleStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+#include "style_angle.h"
+#undef AngleStyle
+#undef ANGLE_CLASS
+    }
   }
 
   sflag = 0;
-
   if (strcmp(style,"none") == 0) return NULL;
 
 #define ANGLE_CLASS
@@ -366,51 +374,55 @@ Angle *Force::new_angle(const char *style, const char *suffix, int &sflag)
    create a dihedral style, called from input script or restart file
 ------------------------------------------------------------------------- */
 
-void Force::create_dihedral(const char *style, const char *suffix)
+void Force::create_dihedral(const char *style, int trysuffix)
 {
   delete [] dihedral_style;
   if (dihedral) delete dihedral;
 
   int sflag;
-  dihedral = new_dihedral(style,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    int n = strlen(estyle) + 1;
-    dihedral_style = new char[n];
-    strcpy(dihedral_style,estyle);
-  } else {
-    int n = strlen(style) + 1;
-    dihedral_style = new char[n];
-    strcpy(dihedral_style,style);
-  }
+  dihedral = new_dihedral(style,trysuffix,sflag);
+  store_style(dihedral_style,style,sflag);
 }
 
 /* ----------------------------------------------------------------------
    generate a dihedral class
 ------------------------------------------------------------------------- */
 
-Dihedral *Force::new_dihedral(const char *style, const char *suffix, int &sflag)
+Dihedral *Force::new_dihedral(const char *style, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
 
-    if (0) return NULL;
+      if (0) return NULL;
 
 #define DIHEDRAL_CLASS
 #define DihedralStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
 #include "style_dihedral.h"
 #undef DihedralStyle
 #undef DIHEDRAL_CLASS
+    }
 
+    // 2nd try: style name with suffix2 appended; sflag = 2 on a match
+    if (lmp->suffix2) {
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+      if (0) return NULL;
+
+#define DIHEDRAL_CLASS
+#define DihedralStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+#include "style_dihedral.h"
+#undef DihedralStyle
+#undef DIHEDRAL_CLASS
+    }
   }
 
   sflag = 0;
-
   if (strcmp(style,"none") == 0) return NULL;
 
 #define DIHEDRAL_CLASS
@@ -428,51 +440,55 @@ Dihedral *Force::new_dihedral(const char *style, const char *suffix, int &sflag)
    create an improper style, called from input script or restart file
 ------------------------------------------------------------------------- */
 
-void Force::create_improper(const char *style, const char *suffix)
+void Force::create_improper(const char *style, int trysuffix)
 {
   delete [] improper_style;
   if (improper) delete improper;
 
   int sflag;
-  improper = new_improper(style,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    int n = strlen(estyle) + 1;
-    improper_style = new char[n];
-    strcpy(improper_style,estyle);
-  } else {
-    int n = strlen(style) + 1;
-    improper_style = new char[n];
-    strcpy(improper_style,style);
-  }
+  improper = new_improper(style,trysuffix,sflag);
+  store_style(improper_style,style,sflag);
 }
 
 /* ----------------------------------------------------------------------
    generate a improper class
 ------------------------------------------------------------------------- */
 
-Improper *Force::new_improper(const char *style, const char *suffix, int &sflag)
+Improper *Force::new_improper(const char *style, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
 
-    if (0) return NULL;
+      if (0) return NULL;
 
 #define IMPROPER_CLASS
 #define ImproperStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
 #include "style_improper.h"
 #undef ImproperStyle
 #undef IMPROPER_CLASS
+    }
 
+    if (lmp->suffix2) {
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+
+      if (0) return NULL;
+
+#define IMPROPER_CLASS
+#define ImproperStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp);
+#include "style_improper.h"
+#undef ImproperStyle
+#undef IMPROPER_CLASS
+    }
   }
 
   sflag = 0;
-
   if (strcmp(style,"none") == 0) return NULL;
 
 #define IMPROPER_CLASS
@@ -504,25 +520,14 @@ Improper *Force::improper_match(const char *style)
    new kspace style
 ------------------------------------------------------------------------- */
 
-void Force::create_kspace(int narg, char **arg, const char *suffix)
+void Force::create_kspace(int narg, char **arg, int trysuffix)
 {
   delete [] kspace_style;
   if (kspace) delete kspace;
 
   int sflag;
-  kspace = new_kspace(narg,arg,suffix,sflag);
-
-  if (sflag) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",arg[0],suffix);
-    int n = strlen(estyle) + 1;
-    kspace_style = new char[n];
-    strcpy(kspace_style,estyle);
-  } else {
-    int n = strlen(arg[0]) + 1;
-    kspace_style = new char[n];
-    strcpy(kspace_style,arg[0]);
-  }
+  kspace = new_kspace(narg,arg,trysuffix,sflag);
+  store_style(kspace_style,arg[0],sflag);
 
   if (comm->style == 1 && !kspace_match("ewald",0))
     error->all(FLERR,
@@ -533,26 +538,41 @@ void Force::create_kspace(int narg, char **arg, const char *suffix)
    generate a kspace class
 ------------------------------------------------------------------------- */
 
-KSpace *Force::new_kspace(int narg, char **arg, const char *suffix, int &sflag)
+KSpace *Force::new_kspace(int narg, char **arg, int trysuffix, int &sflag)
 {
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",arg[0],suffix);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",arg[0],lmp->suffix);
 
-    if (0) return NULL;
+      if (0) return NULL;
 
 #define KSPACE_CLASS
 #define KSpaceStyle(key,Class) \
-  else if (strcmp(estyle,#key) == 0) return new Class(lmp,narg-1,&arg[1]);
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp,narg-1,&arg[1]);
 #include "style_kspace.h"
 #undef KSpaceStyle
 #undef KSPACE_CLASS
+    }
 
+    // 2nd try: style name with suffix2 appended; sflag = 2 on a match
+    if (lmp->suffix2) {
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",arg[0],lmp->suffix2);
+      if (0) return NULL;
+
+#define KSPACE_CLASS
+#define KSpaceStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) return new Class(lmp,narg-1,&arg[1]);
+#include "style_kspace.h"
+#undef KSpaceStyle
+#undef KSPACE_CLASS
+    }
   }
 
   sflag = 0;
-
   if (strcmp(arg[0],"none") == 0) return NULL;
 
 #define KSPACE_CLASS
@@ -579,6 +599,28 @@ KSpace *Force::kspace_match(const char *word, int exact)
   return NULL;
 }
 
+/* ----------------------------------------------------------------------
+   store style name in str allocated here
+   if sflag = 0, no suffix
+   if sflag = 1/2, append suffix or suffix2 to style
+------------------------------------------------------------------------- */
+
+void Force::store_style(char *&str, const char *style, int sflag)
+{
+  // sflag = 0: no suffix, 1: lmp->suffix, any other value: lmp->suffix2
+  const char *sfx = NULL;
+  if (sflag) sfx = (sflag == 1) ? lmp->suffix : lmp->suffix2;
+
+  // size the allocation from the real string lengths instead of staging
+  // the name in a fixed char[256], which sprintf() could overrun
+  int n = strlen(style) + 1;
+  if (sfx) n += strlen(sfx) + 1;
+  str = new char[n];
+
+  if (sfx) sprintf(str,"%s/%s",style,sfx);
+  else strcpy(str,style);
+}
+
 /* ----------------------------------------------------------------------
    set special bond values
 ------------------------------------------------------------------------- */
diff --git a/src/force.h b/src/force.h
index bf364f253b..f857c1a119 100644
--- a/src/force.h
+++ b/src/force.h
@@ -77,28 +77,29 @@ class Force : protected Pointers {
   ~Force();
   void init();
 
-  void create_pair(const char *, const char *suffix = NULL);
-  class Pair *new_pair(const char *, const char *, int &);
+  void create_pair(const char *, int);
+  class Pair *new_pair(const char *, int, int &);
   class Pair *pair_match(const char *, int);
 
-  void create_bond(const char *, const char *suffix = NULL);
-  class Bond *new_bond(const char *, const char *, int &);
+  void create_bond(const char *, int);
+  class Bond *new_bond(const char *, int, int &);
   class Bond *bond_match(const char *);
 
-  void create_angle(const char *, const char *suffix = NULL);
-  class Angle *new_angle(const char *, const char *, int &);
+  void create_angle(const char *, int);
+  class Angle *new_angle(const char *, int, int &);
 
-  void create_dihedral(const char *, const char *suffix = NULL);
-  class Dihedral *new_dihedral(const char *, const char *, int &);
+  void create_dihedral(const char *, int);
+  class Dihedral *new_dihedral(const char *, int, int &);
 
-  void create_improper(const char *, const char *suffix = NULL);
-  class Improper *new_improper(const char *, const char *, int &);
+  void create_improper(const char *, int);
+  class Improper *new_improper(const char *, int, int &);
   class Improper *improper_match(const char *);
 
-  void create_kspace(int, char **, const char *suffix = NULL);
-  class KSpace *new_kspace(int, char **, const char *, int &);
+  void create_kspace(int, char **, int);
+  class KSpace *new_kspace(int, char **, int, int &);
   class KSpace *kspace_match(const char *, int);
 
+  void store_style(char *&, const char *, int);
   void set_special(int, char **);
   void bounds(char *, int, int &, int &, int nmin=1);
   void boundsbig(char *, bigint, bigint &, bigint &, bigint nmin=1);
diff --git a/src/improper_hybrid.cpp b/src/improper_hybrid.cpp
index 9212051e38..09e73ac9b5 100644
--- a/src/improper_hybrid.cpp
+++ b/src/improper_hybrid.cpp
@@ -209,7 +209,7 @@ void ImproperHybrid::settings(int narg, char **arg)
   // one exception is 1st arg of style "table", which is non-numeric
   // need a better way to skip these exceptions
 
-  int dummy;
+  int sflag;
   nstyles = 0;
   i = 0;
 
@@ -223,9 +223,10 @@ void ImproperHybrid::settings(int narg, char **arg)
                  "Improper style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[i],"none") == 0)
       error->all(FLERR,"Improper style hybrid cannot have none as an argument");
-    styles[nstyles] = force->new_improper(arg[i],lmp->suffix,dummy);
-    keywords[nstyles] = new char[strlen(arg[i])+1];
-    strcpy(keywords[nstyles],arg[i]);
+
+    styles[nstyles] = force->new_improper(arg[i],1,sflag);
+    force->store_style(keywords[nstyles],arg[i],sflag);
+
     istyle = i;
     if (strcmp(arg[i],"table") == 0) i++;
     i++;
@@ -319,7 +320,7 @@ void ImproperHybrid::read_restart(FILE *fp)
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
-    styles[m] = force->new_improper(keywords[m],lmp->suffix,dummy);
+    styles[m] = force->new_improper(keywords[m],0,dummy);
   }
 }
 
diff --git a/src/input.cpp b/src/input.cpp
index 785929bb21..ac91ab5467 100644
--- a/src/input.cpp
+++ b/src/input.cpp
@@ -1083,7 +1083,7 @@ void Input::angle_style()
   if (narg < 1) error->all(FLERR,"Illegal angle_style command");
   if (atom->avec->angles_allow == 0)
     error->all(FLERR,"Angle_style command when no angles allowed");
-  force->create_angle(arg[0],lmp->suffix);
+  force->create_angle(arg[0],1);
   if (force->angle) force->angle->settings(narg-1,&arg[1]);
 }
 
@@ -1101,7 +1101,7 @@ void Input::atom_style()
   if (narg < 1) error->all(FLERR,"Illegal atom_style command");
   if (domain->box_exist)
     error->all(FLERR,"Atom_style command after simulation box is defined");
-  atom->create_avec(arg[0],narg-1,&arg[1],lmp->suffix);
+  atom->create_avec(arg[0],narg-1,&arg[1],1);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1124,7 +1124,7 @@ void Input::bond_style()
   if (narg < 1) error->all(FLERR,"Illegal bond_style command");
   if (atom->avec->bonds_allow == 0)
     error->all(FLERR,"Bond_style command when no bonds allowed");
-  force->create_bond(arg[0],lmp->suffix);
+  force->create_bond(arg[0],1);
   if (force->bond) force->bond->settings(narg-1,&arg[1]);
 }
 
@@ -1175,7 +1175,7 @@ void Input::comm_style()
 
 void Input::compute()
 {
-  modify->add_compute(narg,arg,lmp->suffix);
+  modify->add_compute(narg,arg,1);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1213,7 +1213,7 @@ void Input::dihedral_style()
   if (narg < 1) error->all(FLERR,"Illegal dihedral_style command");
   if (atom->avec->dihedrals_allow == 0)
     error->all(FLERR,"Dihedral_style command when no dihedrals allowed");
-  force->create_dihedral(arg[0],lmp->suffix);
+  force->create_dihedral(arg[0],1);
   if (force->dihedral) force->dihedral->settings(narg-1,&arg[1]);
 }
 
@@ -1253,7 +1253,7 @@ void Input::dump_modify()
 
 void Input::fix()
 {
-  modify->add_fix(narg,arg,lmp->suffix);
+  modify->add_fix(narg,arg,1);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1290,7 +1290,7 @@ void Input::improper_style()
   if (narg < 1) error->all(FLERR,"Illegal improper_style command");
   if (atom->avec->impropers_allow == 0)
     error->all(FLERR,"Improper_style command when no impropers allowed");
-  force->create_improper(arg[0],lmp->suffix);
+  force->create_improper(arg[0],1);
   if (force->improper) force->improper->settings(narg-1,&arg[1]);
 }
 
@@ -1307,7 +1307,7 @@ void Input::kspace_modify()
 
 void Input::kspace_style()
 {
-  force->create_kspace(narg,arg,lmp->suffix);
+  force->create_kspace(narg,arg,1);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1412,7 +1412,7 @@ void Input::package()
     fixarg[1] = (char *) "all";
     fixarg[2] = (char *) "GPU";
     for (int i = 1; i < narg; i++) fixarg[i+2] = arg[i];
-    modify->add_fix(2+narg,fixarg,NULL);
+    modify->add_fix(2+narg,fixarg);
     delete [] fixarg;
     force->newton_pair = 0;
 
@@ -1427,9 +1427,54 @@ void Input::package()
     fixarg[1] = (char *) "all";
     fixarg[2] = (char *) "OMP";
     for (int i = 1; i < narg; i++) fixarg[i+2] = arg[i];
-    modify->add_fix(2+narg,fixarg,NULL);
+    modify->add_fix(2+narg,fixarg);
     delete [] fixarg;
 
+ } else if (strcmp(arg[0],"intel") == 0) {
+
+    // add omp package for non-pair routines
+
+    /*
+    char **fixarg = new char*[2+narg];
+    fixarg[0] = (char *) "package_omp";
+    fixarg[1] = (char *) "all";
+    fixarg[2] = (char *) "OMP";
+    int omp_narg = 3;
+    if (narg > 1) {
+      fixarg[3] = arg[1];
+      omp_narg++;
+      if (narg > 2)
+	for (int i = 2; i < narg; i++)
+	  if (strcmp(arg[i],"mixed") == 0) {
+	    fixarg[4] = arg[i];
+	    omp_narg++;
+	  }
+    }
+    modify->add_fix(omp_narg,fixarg);
+
+    // add intel package for neighbor and pair routines
+    */
+
+    char **fixarg = new char*[2+narg];
+    fixarg[0] = (char *) "package_intel";
+    fixarg[1] = (char *) "all";
+    fixarg[2] = (char *) "Intel";
+    for (int i = 1; i < narg; i++) fixarg[i+2] = arg[i];
+    modify->add_fix(2+narg,fixarg);
+    delete [] fixarg;
+
+    /*
+    // if running with offload, set run_style to verlet/intel
+
+    #ifdef LMP_INTEL_OFFLOAD
+    #ifdef __INTEL_OFFLOAD
+    char *str;
+    str = (char *) "verlet/intel";
+    update->create_integrate(1,&str,0);
+    #endif
+    #endif
+    */
+
   } else error->all(FLERR,"Illegal package command");
 }
 
@@ -1461,11 +1506,27 @@ void Input::pair_modify()
 void Input::pair_style()
 {
   if (narg < 1) error->all(FLERR,"Illegal pair_style command");
-  if (force->pair && strcmp(arg[0],force->pair_style) == 0) {
-    force->pair->settings(narg-1,&arg[1]);
-    return;
+  if (force->pair) {
+    int match = 0;
+    if (strcmp(arg[0],force->pair_style) == 0) match = 1;
+    if (!match && lmp->suffix_enable) {
+      char estyle[256];
+      if (lmp->suffix) {
+        sprintf(estyle,"%s/%s",arg[0],lmp->suffix);
+        if (strcmp(estyle,force->pair_style) == 0) match = 1;
+      }
+      if (lmp->suffix2) {
+        sprintf(estyle,"%s/%s",arg[0],lmp->suffix2);
+        if (strcmp(estyle,force->pair_style) == 0) match = 1;
+      }
+    }
+    if (match) {
+      force->pair->settings(narg-1,&arg[1]);
+      return;
+    }
   }
-  force->create_pair(arg[0],lmp->suffix);
+
+  force->create_pair(arg[0],1);
   if (force->pair) force->pair->settings(narg-1,&arg[1]);
 }
 
@@ -1514,7 +1575,7 @@ void Input::run_style()
 {
   if (domain->box_exist == 0)
     error->all(FLERR,"Run_style command before simulation box is defined");
-  update->create_integrate(narg,arg,lmp->suffix);
+  update->create_integrate(narg,arg,1);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1561,6 +1622,12 @@ void Input::suffix()
     int n = strlen(arg[0]) + 1;
     lmp->suffix = new char[n];
     strcpy(lmp->suffix,arg[0]);
+    // 2nd suffix is "omp" only while suffix = "intel"; clear it otherwise
+    delete [] lmp->suffix2;
+    lmp->suffix2 = NULL;
+    if (strcmp(lmp->suffix,"intel") == 0) {
+      lmp->suffix2 = strcpy(new char[4],"omp");
+    }
     lmp->suffix_enable = 1;
   }
 }
diff --git a/src/lammps.cpp b/src/lammps.cpp
index d1e84cf9b3..69945a805f 100644
--- a/src/lammps.cpp
+++ b/src/lammps.cpp
@@ -45,6 +45,7 @@
 #include "accelerator_cuda.h"
 #include "accelerator_kokkos.h"
 #include "accelerator_omp.h"
+#include "accelerator_intel.h"
 #include "timer.h"
 #include "memory.h"
 #include "error.h"
@@ -84,7 +85,7 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
   int citeflag = 1;
   int helpflag = 0;
 
-  suffix = NULL;
+  suffix = suffix2 = NULL;
   suffix_enable = 0;
   char *rfile = NULL;
   char *dfile = NULL;
@@ -172,6 +173,11 @@ LAMMPS::LAMMPS(int narg, char **arg, MPI_Comm communicator)
       int n = strlen(arg[iarg+1]) + 1;
       suffix = new char[n];
       strcpy(suffix,arg[iarg+1]);
+      // set 2nd suffix = "omp" when suffix = "intel"
+      if (strcmp(suffix,"intel") == 0) {
+        suffix2 = new char[4];
+        strcpy(suffix2,"omp");
+      }
       suffix_enable = 1;
       iarg += 2;
     } else if (strcmp(arg[iarg],"-reorder") == 0 ||
@@ -535,6 +541,7 @@ LAMMPS::~LAMMPS()
   delete cuda;
   delete kokkos;
   delete [] suffix;
+  delete [] suffix2;
 
   delete input;
   delete universe;
@@ -571,7 +578,7 @@ void LAMMPS::create()
 
   if (kokkos) atom = new AtomKokkos(this);
   else atom = new Atom(this);
-  atom->create_avec("atomic",0,NULL,suffix);
+  atom->create_avec("atomic",0,NULL,1);
 
   group = new Group(this);
   force = new Force(this);    // must be after group, to create temperature
@@ -590,13 +597,20 @@ void LAMMPS::create()
    invoke package-specific setup commands
    called from LAMMPS constructor and after clear() command
    only invoke if suffix is set and enabled
+   also check if suffix2 is set
 ------------------------------------------------------------------------- */
 
 void LAMMPS::post_create()
 {
-  if (suffix && suffix_enable) {
+  if (!suffix_enable) return;
+  if (suffix) {
     if (strcmp(suffix,"gpu") == 0) input->one("package gpu force/neigh 0 0 1");
     if (strcmp(suffix,"omp") == 0) input->one("package omp *");
+    if (strcmp(suffix,"intel") == 0) 
+      input->one("package intel * mixed balance -1");
+  }
+  // suffix2 triggers its own package command, keyed on suffix2 itself
+  if (suffix2 && strcmp(suffix2,"omp") == 0)
+    input->one("package omp *");
 }
 
diff --git a/src/lammps.h b/src/lammps.h
index 44c7921bd8..8ff0eca067 100644
--- a/src/lammps.h
+++ b/src/lammps.h
@@ -42,11 +42,14 @@ class LAMMPS {
   FILE *screen;                  // screen output
   FILE *logfile;                 // logfile
 
-  char *suffix;                  // suffix to add to input script style names
-  int suffix_enable;             // 1 if suffix enabled, 0 if disabled
+  char *suffix,*suffix2;         // suffixes to add to input script style names
+  int suffix_enable;             // 1 if suffixes are enabled, 0 if disabled
   int cite_enable;               // 1 if generating log.cite, 0 if disabled
 
   class Cuda *cuda;              // CUDA accelerator class
+  //class GPU *gpu;                // GPU accelerator class
+  //class Intel *intel;            // Intel accelerator class
+  //class OMP *omp;                // OMP accelerator class
   class KokkosLMP *kokkos;       // KOKKOS accelerator class
 
   class CiteMe *citeme;          // citation info
diff --git a/src/modify.cpp b/src/modify.cpp
index b55f368fef..065838f4ce 100644
--- a/src/modify.cpp
+++ b/src/modify.cpp
@@ -31,7 +31,7 @@ using namespace FixConst;
 
 #define DELTA 4
 #define BIG 1.0e20
-#define NEXCEPT 4       // change when add to exceptions in add_fix()
+#define NEXCEPT 5       // change when add to exceptions in add_fix()
 
 /* ---------------------------------------------------------------------- */
 
@@ -649,7 +649,7 @@ int Modify::min_reset_ref()
    add a new fix or replace one with same ID
 ------------------------------------------------------------------------- */
 
-void Modify::add_fix(int narg, char **arg, char *suffix)
+void Modify::add_fix(int narg, char **arg, int trysuffix)
 {
   if (narg < 3) error->all(FLERR,"Illegal fix command");
 
@@ -658,9 +658,10 @@ void Modify::add_fix(int narg, char **arg, char *suffix)
   //   but can't think of better way
   // too late if instantiate fix, then check flag set in fix constructor,
   //   since some fixes access domain settings in their constructor
-  // change NEXCEPT above when add new fix to this list
+  // MUST change NEXCEPT above when add new fix to this list
 
-  const char *exceptions[NEXCEPT] = {"GPU","OMP","property/atom","cmap"};
+  const char *exceptions[NEXCEPT] = 
+    {"GPU","OMP","Intel","property/atom","cmap"};
 
   if (domain->box_exist == 0) {
     int m;
@@ -694,12 +695,27 @@ void Modify::add_fix(int narg, char **arg, char *suffix)
 
   if (ifix < nfix) {
     newflag = 0;
-    if (strcmp(arg[2],fix[ifix]->style) != 0)
-      error->all(FLERR,"Replacing a fix, but new style != old style");
+    
+    int match = 0;
+    if (strcmp(arg[2],fix[ifix]->style) == 0) match = 1;
+    if (!match && trysuffix && lmp->suffix_enable) {
+      char estyle[256];
+      if (lmp->suffix) {
+        sprintf(estyle,"%s/%s",arg[2],lmp->suffix);
+        if (strcmp(estyle,fix[ifix]->style) == 0) match = 1;
+      }
+      if (lmp->suffix2) {
+        sprintf(estyle,"%s/%s",arg[2],lmp->suffix2);
+        if (strcmp(estyle,fix[ifix]->style) == 0) match = 1;
+      }
+    }
+    if (!match) error->all(FLERR,"Replacing a fix, but new style != old style");
+
     if (fix[ifix]->igroup != igroup && comm->me == 0)
       error->warning(FLERR,"Replacing a fix, but new group != old group");
     delete fix[ifix];
     fix[ifix] = NULL;
+
   } else {
     newflag = 1;
     if (nfix == maxfix) {
@@ -714,12 +730,22 @@ void Modify::add_fix(int narg, char **arg, char *suffix)
 
   fix[ifix] = NULL;
 
-  if (suffix && lmp->suffix_enable) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",arg[2],suffix);
-    if (fix_map->find(estyle) != fix_map->end()) {
-      FixCreator fix_creator = (*fix_map)[estyle];
-      fix[ifix] = fix_creator(lmp,narg,arg);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      char estyle[256];
+      sprintf(estyle,"%s/%s",arg[2],lmp->suffix);
+      if (fix_map->find(estyle) != fix_map->end()) {
+        FixCreator fix_creator = (*fix_map)[estyle];
+        fix[ifix] = fix_creator(lmp,narg,arg);
+      }
+    }
+    if (fix[ifix] == NULL && lmp->suffix2) {
+      char estyle[256];
+      sprintf(estyle,"%s/%s",arg[2],lmp->suffix2);
+      if (fix_map->find(estyle) != fix_map->end()) {
+        FixCreator fix_creator = (*fix_map)[estyle];
+        fix[ifix] = fix_creator(lmp,narg,arg);
+      }
     }
   }
 
@@ -838,7 +864,7 @@ int Modify::find_fix(const char *id)
    add a new compute
 ------------------------------------------------------------------------- */
 
-void Modify::add_compute(int narg, char **arg, char *suffix)
+void Modify::add_compute(int narg, char **arg, int trysuffix)
 {
   if (narg < 3) error->all(FLERR,"Illegal compute command");
 
@@ -861,12 +887,22 @@ void Modify::add_compute(int narg, char **arg, char *suffix)
 
   compute[ncompute] = NULL;
 
-  if (suffix && lmp->suffix_enable) {
-    char estyle[256];
-    sprintf(estyle,"%s/%s",arg[2],suffix);
-    if (compute_map->find(estyle) != compute_map->end()) {
-      ComputeCreator compute_creator = (*compute_map)[estyle];
-      compute[ncompute] = compute_creator(lmp,narg,arg);
+  if (trysuffix && lmp->suffix_enable) {
+    if (lmp->suffix) {
+      char estyle[256];
+      sprintf(estyle,"%s/%s",arg[2],lmp->suffix);
+      if (compute_map->find(estyle) != compute_map->end()) {
+        ComputeCreator compute_creator = (*compute_map)[estyle];
+        compute[ncompute] = compute_creator(lmp,narg,arg);
+      }
+    }
+    if (compute[ncompute] == NULL && lmp->suffix2) {
+      char estyle[256];
+      sprintf(estyle,"%s/%s",arg[2],lmp->suffix2);
+      if (compute_map->find(estyle) != compute_map->end()) {
+        ComputeCreator compute_creator = (*compute_map)[estyle];
+        compute[ncompute] = compute_creator(lmp,narg,arg);
+      }
     }
   }
 
diff --git a/src/modify.h b/src/modify.h
index 422c77d2fe..ba7101a934 100644
--- a/src/modify.h
+++ b/src/modify.h
@@ -82,12 +82,12 @@ class Modify : protected Pointers {
   virtual int min_dof();
   virtual int min_reset_ref();
 
-  void add_fix(int, char **, char *suffix = NULL);
+  void add_fix(int, char **, int trysuffix=0); // defaults to 0; presumably gates suffixed lookup like add_compute - confirm
   void modify_fix(int, char **);
   void delete_fix(const char *);
   int find_fix(const char *);
 
-  void add_compute(int, char **, char *suffix = NULL);
+  void add_compute(int, char **, int trysuffix=0); // nonzero: try "<style>/<suffix>" names before the plain style
   void modify_compute(int, char **);
   void delete_compute(const char *);
   int find_compute(const char *);
diff --git a/src/neigh_list.cpp b/src/neigh_list.cpp
index dc925a6f90..0de4c0cb5f 100644
--- a/src/neigh_list.cpp
+++ b/src/neigh_list.cpp
@@ -246,6 +246,6 @@ void NeighList::print_attributes()
   printf("  %d = occasional\n",rq->occasional);
   printf("  %d = dnum\n",rq->dnum);
   printf("  %d = omp\n",rq->omp);
+  printf("  %d = intel\n",rq->intel); // new request flag, listed next to omp
   printf("  %d = ghost\n",rq->ghost);
   printf("  %d = cudable\n",rq->cudable);
-  printf("  %d = omp\n",rq->omp);
diff --git a/src/neigh_request.cpp b/src/neigh_request.cpp
index 643d11b3bb..95ddc01517 100644
--- a/src/neigh_request.cpp
+++ b/src/neigh_request.cpp
@@ -56,6 +56,7 @@ NeighRequest::NeighRequest(LAMMPS *lmp) : Pointers(lmp)
   ghost = 0;
   cudable = 0;
   omp = 0;
+  intel = 0; // intel build is off unless a request sets it
   kokkos_host = kokkos_device = 0;
 
   // default is no copy or skip
@@ -126,6 +127,7 @@ int NeighRequest::identical(NeighRequest *other)
   if (ghost != other->ghost) same = 0;
   if (cudable != other->cudable) same = 0;
   if (omp != other->omp) same = 0;
+  if (intel != other->intel) same = 0; // requests that differ in the intel flag are not identical
 
   if (copy != other->copy_original) same = 0;
   if (same_skip(other) == 0) same = 0;
@@ -155,6 +157,7 @@ int NeighRequest::same_kind(NeighRequest *other)
   if (ghost != other->ghost) same = 0;
   if (cudable != other->cudable) same = 0;
   if (omp != other->omp) same = 0;
+  if (intel != other->intel) same = 0; // the intel flag is part of the request "kind"
 
   return same;
 }
@@ -205,4 +208,5 @@ void NeighRequest::copy_request(NeighRequest *other)
   ghost = other->ghost;
   cudable = other->cudable;
   omp = other->omp;
+  intel = other->intel; // copied along with the other request flags
 }
diff --git a/src/neigh_request.h b/src/neigh_request.h
index 769d5354bf..41fa951fee 100644
--- a/src/neigh_request.h
+++ b/src/neigh_request.h
@@ -79,9 +79,10 @@ class NeighRequest : protected Pointers {
 
   int cudable;
 
-  // 1 if using multi-threaded neighbor list build
+  // 1 if using multi-threaded neighbor list build for USER-OMP or USER-INTEL
 
   int omp;
+  int intel; // tested by Neighbor::choose_build to pick the *_intel build functions
 
   // 1 if using Kokkos neighbor build
 
diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index 705887ba03..28c051313c 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -920,7 +920,7 @@ void Neighbor::choose_build(int index, NeighRequest *rq)
 {
   PairPtr pb = NULL;
 
-  if (rq->omp == 0) {
+  if (rq->omp == 0 && rq->intel == 0) { // this branch now also excludes requests with the intel flag set
 
     if (rq->copy) pb = &Neighbor::copy_from;
 
@@ -1076,21 +1076,33 @@ void Neighbor::choose_build(int index, NeighRequest *rq)
       } else if (style == BIN) {
         if (rq->newton == 0) {
           if (newton_pair == 0) {
-            if (rq->ghost == 0) pb = &Neighbor::half_bin_no_newton_omp;
-            else if (includegroup)
+            if (rq->ghost == 0) {
+              if (rq->intel) pb = &Neighbor::half_bin_no_newton_intel; // intel flag picks the *_intel build, else *_omp
+              else pb = &Neighbor::half_bin_no_newton_omp;
+            } else if (includegroup) // NOTE(review): ghost requests keep the *_ghost_omp build even if rq->intel is set
               error->all(FLERR,"Neighbor include group not allowed "
                          "with ghost neighbors");
             else pb = &Neighbor::half_bin_no_newton_ghost_omp;
           } else if (triclinic == 0) {
-            pb = &Neighbor::half_bin_newton_omp;
-          } else if (triclinic == 1)
-            pb = &Neighbor::half_bin_newton_tri_omp;
+            if (rq->intel) pb = &Neighbor::half_bin_newton_intel;
+            else pb = &Neighbor::half_bin_newton_omp;
+          } else if (triclinic == 1) {
+            if (rq->intel) pb = &Neighbor::half_bin_newton_tri_intel;
+            else pb = &Neighbor::half_bin_newton_tri_omp;
+          }
         } else if (rq->newton == 1) {
-          if (triclinic == 0) pb = &Neighbor::half_bin_newton_omp;
-          else if (triclinic == 1) pb = &Neighbor::half_bin_newton_tri_omp;
+          if (triclinic == 0) {
+            if (rq->intel) pb = &Neighbor::half_bin_newton_intel;
+            else pb = &Neighbor::half_bin_newton_omp;
+          } else if (triclinic == 1) {
+            if (rq->intel) pb = &Neighbor::half_bin_newton_tri_intel;
+            else pb = &Neighbor::half_bin_newton_tri_omp;
+          }
         } else if (rq->newton == 2) {
-          if (rq->ghost == 0) pb = &Neighbor::half_bin_no_newton_omp;
-          else if (includegroup)
+          if (rq->ghost == 0) {
+            if (rq->intel) pb = &Neighbor::half_bin_no_newton_intel;
+            else pb = &Neighbor::half_bin_no_newton_omp;
+          } else if (includegroup) // same as above: the ghost path has no intel variant here
             error->all(FLERR,"Neighbor include group not allowed "
                        "with ghost neighbors");
           else pb = &Neighbor::half_bin_no_newton_ghost_omp;
diff --git a/src/neighbor.h b/src/neighbor.h
index 3c0c4af889..05a8622d04 100644
--- a/src/neighbor.h
+++ b/src/neighbor.h
@@ -237,6 +237,7 @@ class Neighbor : protected Pointers {
 
 #define LMP_INSIDE_NEIGHBOR_H
 #include "accelerator_omp.h"
+#include "accelerator_intel.h" // presumably declares the *_intel build methods used in neighbor.cpp - confirm
 #undef LMP_INSIDE_NEIGHBOR_H
 
   // pairwise stencil creation functions
diff --git a/src/output.cpp b/src/output.cpp
index 0383dfe84d..6a9223604e 100644
--- a/src/output.cpp
+++ b/src/output.cpp
@@ -50,18 +50,18 @@ Output::Output(LAMMPS *lmp) : Pointers(lmp)
   newarg[0] = (char *) "thermo_temp";
   newarg[1] = (char *) "all";
   newarg[2] = (char *) "temp";
-  modify->add_compute(3,newarg,lmp->suffix);
+  modify->add_compute(3,newarg,1); // 1 = trysuffix: add_compute reads lmp->suffix/suffix2 itself
 
   newarg[0] = (char *) "thermo_press";
   newarg[1] = (char *) "all";
   newarg[2] = (char *) "pressure";
   newarg[3] = (char *) "thermo_temp";
-  modify->add_compute(4,newarg,lmp->suffix);
+  modify->add_compute(4,newarg,1); // trysuffix on
 
   newarg[0] = (char *) "thermo_pe";
   newarg[1] = (char *) "all";
   newarg[2] = (char *) "pe";
-  modify->add_compute(3,newarg,lmp->suffix);
+  modify->add_compute(3,newarg,1); // trysuffix on
 
   delete [] newarg;
 
diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index e15d4b00d8..15de14db13 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -219,7 +219,7 @@ void PairHybrid::settings(int narg, char **arg)
   // call settings() with set of args that are not pair style names
   // use force->pair_map to determine which args these are
 
-  int iarg,jarg,dummy;
+  int iarg,jarg,sflag; // sflag is handed to force->new_pair() and then to force->store_style() below
 
   iarg = 0;
   nstyles = 0;
@@ -228,10 +228,10 @@ void PairHybrid::settings(int narg, char **arg)
       error->all(FLERR,"Pair style hybrid cannot have hybrid as an argument");
     if (strcmp(arg[iarg],"none") == 0)
       error->all(FLERR,"Pair style hybrid cannot have none as an argument");
-    styles[nstyles] = force->new_pair(arg[iarg],lmp->suffix,dummy);
-    int n = strlen(arg[iarg]) + 1;
-    keywords[nstyles] = new char[n];
-    strcpy(keywords[nstyles],arg[iarg]);
+
+    styles[nstyles] = force->new_pair(arg[iarg],1,sflag); // 1: presumably the trysuffix flag - confirm in Force::new_pair
+    force->store_style(keywords[nstyles],arg[iarg],sflag); // replaces the manual new[]/strcpy; presumably appends the matched suffix - confirm
+
     jarg = iarg + 1;
     while (jarg < narg && !force->pair_map->count(arg[jarg])) jarg++;
     styles[nstyles]->settings(jarg-iarg-1,&arg[iarg+1]);
@@ -637,7 +637,7 @@ void PairHybrid::read_restart(FILE *fp)
     keywords[m] = new char[n];
     if (me == 0) fread(keywords[m],sizeof(char),n,fp);
     MPI_Bcast(keywords[m],n,MPI_CHAR,0,world);
-    styles[m] = force->new_pair(keywords[m],lmp->suffix,dummy);
+    styles[m] = force->new_pair(keywords[m],0,dummy); // 0: keyword from the restart file is passed with the flag off (presumably no suffix lookup)
     styles[m]->read_restart_settings(fp);
   }
 
diff --git a/src/read_restart.cpp b/src/read_restart.cpp
index 749bff7f31..441ecbfcd2 100644
--- a/src/read_restart.cpp
+++ b/src/read_restart.cpp
@@ -797,7 +797,7 @@ void ReadRestart::header(int incompatible)
       char **argcopy = new char*[nargcopy];
       for (int i = 0; i < nargcopy; i++)
         argcopy[i] = read_string();
-      atom->create_avec(style,nargcopy,argcopy);
+      atom->create_avec(style,nargcopy,argcopy,0); // new trailing 0: presumably trysuffix off for the stored style - confirm in Atom::create_avec
       for (int i = 0; i < nargcopy; i++) delete [] argcopy[i];
       delete [] argcopy;
       delete [] style;
@@ -891,31 +891,31 @@ void ReadRestart::force_fields()
 
     if (flag == PAIR) {
       style = read_string();
-      force->create_pair(style);
+      force->create_pair(style,0); // new second arg 0: presumably trysuffix off, style name used as read - confirm in Force
       delete [] style;
       force->pair->read_restart(fp);
 
     } else if (flag == BOND) {
       style = read_string();
-      force->create_bond(style);
+      force->create_bond(style,0); // same 0 flag as for the pair style
       delete [] style;
       force->bond->read_restart(fp);
 
     } else if (flag == ANGLE) {
       style = read_string();
-      force->create_angle(style);
+      force->create_angle(style,0); // same 0 flag
       delete [] style;
       force->angle->read_restart(fp);
 
     } else if (flag == DIHEDRAL) {
       style = read_string();
-      force->create_dihedral(style);
+      force->create_dihedral(style,0); // same 0 flag
       delete [] style;
       force->dihedral->read_restart(fp);
 
     } else if (flag == IMPROPER) {
       style = read_string();
-      force->create_improper(style);
+      force->create_improper(style,0); // same 0 flag
       delete [] style;
       force->improper->read_restart(fp);
 
diff --git a/src/replicate.cpp b/src/replicate.cpp
index 26f3fca7ed..7300da0968 100644
--- a/src/replicate.cpp
+++ b/src/replicate.cpp
@@ -116,7 +116,7 @@ void Replicate::command(int narg, char **arg)
   Atom *old = atom;
   atom = new Atom(lmp);
   atom->settings(old);
-  atom->create_avec(old->atom_style,old->avec->nargcopy,old->avec->argcopy);
+  atom->create_avec(old->atom_style,old->avec->nargcopy,old->avec->argcopy,0); // 0: reuse the old atom style name; presumably trysuffix off - confirm
 
   // check that new system will not be too large
   // new tags cannot exceed MAXTAGINT
diff --git a/src/suffix.h b/src/suffix.h
index 2a150ed5ef..43493d6203 100644
--- a/src/suffix.h
+++ b/src/suffix.h
@@ -22,6 +22,7 @@ namespace Suffix {
   static const int GPU  = 1<<1;
   static const int CUDA = 1<<2;
   static const int OMP  = 1<<3;
+  static const int INTEL  = 1<<4; // next bit after OMP
 }
 
 }
diff --git a/src/update.cpp b/src/update.cpp
index a2017db066..610cce1abd 100644
--- a/src/update.cpp
+++ b/src/update.cpp
@@ -62,7 +62,7 @@ Update::Update(LAMMPS *lmp) : Pointers(lmp)
   minimize = NULL;
 
   str = (char *) "verlet";
-  create_integrate(1,&str,lmp->suffix);
+  create_integrate(1,&str,1); // third arg is trysuffix: a suffixed "verlet" variant may be chosen
 
   str = (char *) "cg";
   create_minimize(1,&str);
@@ -293,7 +293,7 @@ void Update::set_units(const char *style)
 
 /* ---------------------------------------------------------------------- */
 
-void Update::create_integrate(int narg, char **arg, char *suffix)
+void Update::create_integrate(int narg, char **arg, int trysuffix) // trysuffix is forwarded to new_integrate()
 {
   if (narg < 1) error->all(FLERR,"Illegal run_style command");
 
@@ -301,11 +301,12 @@ void Update::create_integrate(int narg, char **arg, char *suffix)
   delete integrate;
 
   int sflag;
-  new_integrate(arg[0],narg-1,&arg[1],suffix,sflag);
+  new_integrate(arg[0],narg-1,&arg[1],trysuffix,sflag); // sets sflag: 0 plain style, 1 lmp->suffix, 2 lmp->suffix2
 
   if (sflag) {
     char estyle[256];
-    sprintf(estyle,"%s/%s",arg[0],suffix);
+    if (sflag == 1) sprintf(estyle,"%s/%s",arg[0],lmp->suffix); // rebuild the suffixed name that sflag reports
+    else sprintf(estyle,"%s/%s",arg[0],lmp->suffix2);
     int n = strlen(estyle) + 1;
     integrate_style = new char[n];
     strcpy(integrate_style,estyle);
@@ -321,42 +322,59 @@ void Update::create_integrate(int narg, char **arg, char *suffix)
 ------------------------------------------------------------------------- */
 
 void Update::new_integrate(char *style, int narg, char **arg,
-                           char *suffix, int &sflag)
+                           int trysuffix, int &sflag) // sflag out: 1 = lmp->suffix matched, 2 = lmp->suffix2, 0 = plain style
 {
-  int success = 0;
+  if (trysuffix && lmp->suffix_enable) { // suffixed lookup needs both the caller's flag and suffix_enable
+    if (lmp->suffix) { // first try "<style>/<suffix>"
+      sflag = 1;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix);
+      int success = 1; // cleared by the trailing else when no generated branch matched
 
-  if (suffix && lmp->suffix_enable) {
-    sflag = 1;
-    char estyle[256];
-    sprintf(estyle,"%s/%s",style,suffix);
-    success = 1;
-
-    if (0) return;
+      if (0) return; // never taken; gives the generated "else if" chain an if to attach to
 
 #define INTEGRATE_CLASS
 #define IntegrateStyle(key,Class) \
-    else if (strcmp(estyle,#key) == 0) integrate = new Class(lmp,narg,arg);
+      else if (strcmp(estyle,#key) == 0) integrate = new Class(lmp,narg,arg);
+#include "style_integrate.h"
+#undef IntegrateStyle
+#undef INTEGRATE_CLASS
+
+      else success = 0;
+      if (success) return; // integrate was created with the first suffix
+    }
+
+    if (lmp->suffix2) { // then try "<style>/<suffix2>"
+      sflag = 2;
+      char estyle[256];
+      sprintf(estyle,"%s/%s",style,lmp->suffix2);
+      int success = 1;
+
+      if (0) return;
+
+#define INTEGRATE_CLASS
+#define IntegrateStyle(key,Class) \
+      else if (strcmp(estyle,#key) == 0) integrate = new Class(lmp,narg,arg);
+#include "style_integrate.h"
+#undef IntegrateStyle
+#undef INTEGRATE_CLASS
+
+      else success = 0;
+      if (success) return; // integrate was created with the second suffix
+    }
+  }
+
+  sflag = 0; // no suffixed style was used; fall back to the plain style name
+  if (0) return;
+
+#define INTEGRATE_CLASS
+#define IntegrateStyle(key,Class) \
+  else if (strcmp(style,#key) == 0) integrate = new Class(lmp,narg,arg);
 #include "style_integrate.h"
 #undef IntegrateStyle
 #undef INTEGRATE_CLASS
 
-    else success = 0;
-  }
-
-  if (!success) {
-    sflag = 0;
-
-    if (0) return;
-
-#define INTEGRATE_CLASS
-#define IntegrateStyle(key,Class) \
-    else if (strcmp(style,#key) == 0) integrate = new Class(lmp,narg,arg);
-#include "style_integrate.h"
-#undef IntegrateStyle
-#undef INTEGRATE_CLASS
-
-    else error->all(FLERR,"Illegal integrate style");
-  }
+  else error->all(FLERR,"Illegal integrate style"); // plain name matched nothing either
 }
 
 /* ---------------------------------------------------------------------- */
diff --git a/src/update.h b/src/update.h
index 53bf041dea..5c6c15bad7 100644
--- a/src/update.h
+++ b/src/update.h
@@ -50,7 +50,7 @@ class Update : protected Pointers {
   ~Update();
   void init();
   void set_units(const char *);
-  void create_integrate(int, char **, char *);
+  void create_integrate(int, char **, int); // last arg: trysuffix flag (was a suffix string)
   void create_minimize(int, char **);
   void reset_timestep(int, char **);
   void reset_timestep(bigint);
@@ -58,7 +58,7 @@ class Update : protected Pointers {
   bigint memory_usage();
 
  private:
-  void new_integrate(char *, int, char **, char *, int &);
+  void new_integrate(char *, int, char **, int, int &);
 
 };